From: Ding Tianhong dingtianhong@huawei.com
ascend inclusion category: feature bugzilla: NA CVE: NA
-------------------------------------------------
Commit 59a57a82fb2a ("mm/vmalloc: Hugepage vmalloc mappings") enables hugepage vmalloc by default whenever the allocation size is larger than PMD_SIZE. This behaves like transparent hugepages for mmap: the driver cannot control hugepage usage precisely and its logic may be broken by the implicit promotion. The share pool already exports the vmalloc_hugepage_xxx functions to control hugepage vmalloc allocation explicitly, which behaves like static hugepages for vmalloc, so disable the transparent hugepage behaviour.
This patch also fixes the vm_struct KABI breakage introduced by that commit (the added page_order field is dropped), so the patch can be applied to the commercial version.
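For reference, the core of the change, condensed from the __vmalloc_node_range() hunk below (illustrative sketch only, not an extra hunk): PMD mappings are now used only when the caller explicitly asked for them via VM_HUGE_PAGES, never implicitly because the size happens to exceed PMD_SIZE:

	if (vmap_allow_huge && (pgprot_val(prot) == pgprot_val(PAGE_KERNEL)) &&
	    is_vmalloc_huge(vm_flags)) {
		/* VM_HUGE_PAGES is only set by the share pool hugepage helpers */
		shift = PMD_SHIFT;
		align = max(real_align, 1UL << shift);
		size = ALIGN(real_size, 1UL << shift);
	}

Plain vmalloc() callers therefore always get base-page mappings again, and only the vmalloc_hugepage_xxx wrappers opt in to huge mappings.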
Fixes: 59a57a82fb2a ("mm/vmalloc: Hugepage vmalloc mappings") Signed-off-by: Ding Tianhong dingtianhong@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- include/linux/share_pool.h | 51 ++++++++++++++++++++++++++++---------- include/linux/vmalloc.h | 1 - mm/vmalloc.c | 47 ++++++++++++----------------------- 3 files changed, 54 insertions(+), 45 deletions(-)
diff --git a/include/linux/share_pool.h b/include/linux/share_pool.h index c3120b7b24948..4a18c88d5a10e 100644 --- a/include/linux/share_pool.h +++ b/include/linux/share_pool.h @@ -211,15 +211,6 @@ static inline void sp_area_work_around(struct vm_unmapped_area_info *info)
extern struct page *sp_alloc_pages(struct vm_struct *area, gfp_t mask, unsigned int page_order, int node); - -static inline void sp_free_pages(struct page *page, struct vm_struct *area) -{ - if (PageHuge(page)) - put_page(page); - else - __free_pages(page, area->page_order); -} - static inline bool sp_check_vm_share_pool(unsigned long vm_flags) { if (enable_ascend_share_pool && (vm_flags & VM_SHARE_POOL)) @@ -264,6 +255,30 @@ extern void *buff_vzalloc_hugepage_user(unsigned long size);
void sp_exit_mm(struct mm_struct *mm);
+static inline bool is_vmalloc_huge(unsigned long vm_flags) +{ + if (enable_ascend_share_pool && (vm_flags & VM_HUGE_PAGES)) + return true; + + return false; +} + +static inline bool is_vmalloc_sharepool(unsigned long vm_flags) +{ + if (enable_ascend_share_pool && (vm_flags & VM_SHAREPOOL)) + return true; + + return false; +} + +static inline void sp_free_pages(struct page *page, struct vm_struct *area) +{ + if (PageHuge(page)) + put_page(page); + else + __free_pages(page, is_vmalloc_huge(area->flags) ? PMD_SHIFT - PAGE_SHIFT : 0); +} + #else
static inline int sp_group_add_task(int pid, int spg_id) @@ -400,10 +415,6 @@ static inline struct page *sp_alloc_pages(void *area, gfp_t mask, return NULL; }
-static inline void sp_free_pages(struct page *page, struct vm_struct *area) -{ -} - static inline bool sp_check_vm_share_pool(unsigned long vm_flags) { return false; @@ -448,6 +459,20 @@ static inline void *buff_vzalloc_hugepage_user(unsigned long size) return NULL; }
+static inline bool is_vmalloc_huge(unsigned long vm_flags) +{ + return false; +} + +static inline bool is_vmalloc_sharepool(unsigned long vm_flags) +{ + return false; +} + +static inline void sp_free_pages(struct page *page, struct vm_struct *area) +{ +} + #endif
#endif /* LINUX_SHARE_POOL_H */ diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index bb814f6418fd9..298eff5579b21 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -43,7 +43,6 @@ struct vm_struct { unsigned long size; unsigned long flags; struct page **pages; - unsigned int page_order; unsigned int nr_pages; phys_addr_t phys_addr; const void *caller; diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 37b4762871142..8c70131e0b078 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -2354,6 +2354,7 @@ struct vm_struct *remove_vm_area(const void *addr) static void __vunmap(const void *addr, int deallocate_pages) { struct vm_struct *area; + unsigned int page_order = 0;
if (!addr) return; @@ -2369,13 +2370,14 @@ static void __vunmap(const void *addr, int deallocate_pages) return; }
-#ifdef CONFIG_ASCEND_SHARE_POOL /* unmap a sharepool vm area will cause meamleak! */ - if (area->flags & VM_SHAREPOOL) { + if (is_vmalloc_sharepool(area->flags)) { WARN(1, KERN_ERR "Memory leak due to vfree() sharepool vm area (%p) !\n", addr); return; } -#endif + + if (is_vmalloc_huge(area->flags)) + page_order = PMD_SHIFT - PAGE_SHIFT;
debug_check_no_locks_freed(area->addr, get_vm_area_size(area)); debug_check_no_obj_freed(area->addr, get_vm_area_size(area)); @@ -2384,14 +2386,14 @@ static void __vunmap(const void *addr, int deallocate_pages) if (deallocate_pages) { int i;
- for (i = 0; i < area->nr_pages; i += 1U << area->page_order) { + for (i = 0; i < area->nr_pages; i += 1U << page_order) { struct page *page = area->pages[i];
BUG_ON(!page); if (sp_is_enabled()) sp_free_pages(page, area); else - __free_pages(page, area->page_order); + __free_pages(page, page_order); }
kvfree(area->pages); @@ -2589,7 +2591,6 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
area->pages = pages; area->nr_pages = nr_pages; - area->page_order = page_order;
for (i = 0; i < area->nr_pages; i += 1U << page_order) { struct page *page; @@ -2657,27 +2658,17 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align, if (!size || (size >> PAGE_SHIFT) > totalram_pages) goto fail;
- if (vmap_allow_huge && (pgprot_val(prot) == pgprot_val(PAGE_KERNEL))) { - unsigned long size_per_node; - + if (vmap_allow_huge && (pgprot_val(prot) == pgprot_val(PAGE_KERNEL)) && is_vmalloc_huge(vm_flags)) { /* - * Try huge pages. Only try for PAGE_KERNEL allocations, - * others like modules don't yet expect huge pages in - * their allocations due to apply_to_page_range not - * supporting them. + * Alloc huge pages. Only valid for PAGE_KERNEL allocations and + * VM_HUGE_PAGES flags. */
- size_per_node = size; - if (node == NUMA_NO_NODE && !sp_is_enabled()) - size_per_node /= num_online_nodes(); - if (size_per_node >= PMD_SIZE) { - shift = PMD_SHIFT; - align = max(real_align, 1UL << shift); - size = ALIGN(real_size, 1UL << shift); - } + shift = PMD_SHIFT; + align = max(real_align, 1UL << shift); + size = ALIGN(real_size, 1UL << shift); }
-again: size = PAGE_ALIGN(size); area = __get_vm_area_node(size, align, VM_ALLOC | VM_UNINITIALIZED | vm_flags, start, end, node, gfp_mask, caller); @@ -2706,12 +2697,6 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align, return addr;
fail: - if (shift > PAGE_SHIFT) { - shift = PAGE_SHIFT; - align = real_align; - size = real_size; - goto again; - }
if (!area) { /* Warn for area allocation, page allocations already warn */ @@ -3776,7 +3761,7 @@ static int s_show(struct seq_file *m, void *p) seq_printf(m, " %pS", v->caller);
if (v->nr_pages) - seq_printf(m, " pages=%d order=%d", v->nr_pages, v->page_order); + seq_printf(m, " pages=%d", v->nr_pages);
if (v->phys_addr) seq_printf(m, " phys=%pa", &v->phys_addr); @@ -3796,8 +3781,8 @@ static int s_show(struct seq_file *m, void *p) if (is_vmalloc_addr(v->pages)) seq_puts(m, " vpages");
- if (sp_is_enabled()) - seq_printf(m, " order=%d", v->page_order); + if (is_vmalloc_huge(v->flags)) + seq_printf(m, " order=%d", PMD_SHIFT - PAGE_SHIFT);
show_numa_info(m, v); seq_putc(m, '\n');
From: Ye Bin yebin10@huawei.com
hulk inclusion category: bugfix bugzilla: 46833 CVE: NA
-----------------------------------------------
Commit 5d2a6c41d410 may clear the read-only flag when the partition table is re-read.
This reverts commit 5d2a6c41d410ed27797c953c57b0ccad3f0cb636.
Fixes: 5d2a6c41d410 ("scsi: sd: block: Fix read-only flag residuals when partition table change") Signed-off-by: Ye Bin yebin10@huawei.com Reviewed-by: Jason Yan yanaijie@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- block/partition-generic.c | 1 - 1 file changed, 1 deletion(-)
diff --git a/block/partition-generic.c b/block/partition-generic.c index b27ed20d3db4e..63b82df5bbb40 100644 --- a/block/partition-generic.c +++ b/block/partition-generic.c @@ -286,7 +286,6 @@ void delete_partition(struct gendisk *disk, int partno) if (!part) return;
- clear_bit(partno, disk->user_ro_bitmap); get_device(disk_to_dev(disk)); rcu_assign_pointer(ptbl->part[partno], NULL);
From: Ming Lei ming.lei@redhat.com
mainline inclusion from mainline-v5.8-rc7 commit 3f0dcfbcd2e162fc0a11c1f59b7acd42ee45f126 category: bugfix bugzilla: 47875 CVE: NA
-------------------------------------------------
I/O requests may be held in scheduler queue because of resource contention. The starvation scenario was handled properly in the regular completion path but we failed to account for it during I/O submission. This led to the hang captured below. Make sure we run the queue when resource contention is encountered in the submission path.
[ 39.054963] scsi 13:0:0:0: rejecting I/O to dead device [ 39.058700] scsi 13:0:0:0: rejecting I/O to dead device [ 39.087855] sd 13:0:0:1: [sdd] Synchronizing SCSI cache [ 39.088909] scsi 13:0:0:1: rejecting I/O to dead device [ 39.095351] scsi 13:0:0:1: rejecting I/O to dead device [ 39.096962] scsi 13:0:0:1: rejecting I/O to dead device [ 247.021859] INFO: task scsi-stress-rem:813 blocked for more than 122 seconds. [ 247.023258] Not tainted 5.8.0-rc2 #8 [ 247.024069] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. [ 247.025331] scsi-stress-rem D 0 813 802 0x00004000 [ 247.025334] Call Trace: [ 247.025354] __schedule+0x504/0x55f [ 247.027987] schedule+0x72/0xa8 [ 247.027991] blk_mq_freeze_queue_wait+0x63/0x8c [ 247.027994] ? do_wait_intr_irq+0x7a/0x7a [ 247.027996] blk_cleanup_queue+0x4b/0xc9 [ 247.028000] __scsi_remove_device+0xf6/0x14e [ 247.028002] scsi_remove_device+0x21/0x2b [ 247.029037] sdev_store_delete+0x58/0x7c [ 247.029041] kernfs_fop_write+0x10d/0x14f [ 247.031281] vfs_write+0xa2/0xdf [ 247.032670] ksys_write+0x6b/0xb3 [ 247.032673] do_syscall_64+0x56/0x82 [ 247.034053] entry_SYSCALL_64_after_hwframe+0x44/0xa9 [ 247.034059] RIP: 0033:0x7f69f39e9008 [ 247.036330] Code: Bad RIP value. [ 247.036331] RSP: 002b:00007ffdd8116498 EFLAGS: 00000246 ORIG_RAX: 0000000000000001 [ 247.037613] RAX: ffffffffffffffda RBX: 0000000000000002 RCX: 00007f69f39e9008 [ 247.039714] RDX: 0000000000000002 RSI: 000055cde92a0ab0 RDI: 0000000000000001 [ 247.039715] RBP: 000055cde92a0ab0 R08: 000000000000000a R09: 00007f69f3a79e80 [ 247.039716] R10: 000000000000000a R11: 0000000000000246 R12: 00007f69f3abb780 [ 247.039717] R13: 0000000000000002 R14: 00007f69f3ab6740 R15: 0000000000000002
Link: https://lore.kernel.org/r/20200720025435.812030-1-ming.lei@redhat.com Cc: linux-block@vger.kernel.org Cc: Christoph Hellwig hch@lst.de Reviewed-by: Bart Van Assche bvanassche@acm.org Reviewed-by: Christoph Hellwig hch@lst.de Signed-off-by: Ming Lei ming.lei@redhat.com Signed-off-by: Martin K. Petersen martin.petersen@oracle.com Conflict: drivers/scsi/scsi_lib.c [Yufen: compatible with commit 44ea147b2756 ("SCSI: fix queue cleanup race before scsi_requeue_run_queue is done")] Signed-off-by: Yufen Yu yuyufen@huawei.com Reviewed-by: Jason Yan yanaijie@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- drivers/scsi/scsi_lib.c | 27 +++++++++++++++++++-------- 1 file changed, 19 insertions(+), 8 deletions(-)
diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c index 3d9f0e1c1c7e9..8f656ce3588d2 100644 --- a/drivers/scsi/scsi_lib.c +++ b/drivers/scsi/scsi_lib.c @@ -685,6 +685,21 @@ static void scsi_release_bidi_buffers(struct scsi_cmnd *cmd) cmd->request->next_rq->special = NULL; }
+static void scsi_run_queue_async(struct scsi_device *sdev) +{ + struct request_queue *q = sdev->request_queue; + + percpu_ref_get(&q->q_usage_counter); + if (scsi_target(sdev)->single_lun || + !list_empty(&sdev->host->starved_list)) { + if (!kblockd_schedule_work(&sdev->requeue_work)) + percpu_ref_put(&q->q_usage_counter); + } else { + blk_mq_run_hw_queues(q, true); + percpu_ref_put(&q->q_usage_counter); + } +} + /* Returns false when no more bytes to process, true if there are more */ static bool scsi_end_request(struct request *req, blk_status_t error, unsigned int bytes, unsigned int bidi_bytes) @@ -735,14 +750,9 @@ static bool scsi_end_request(struct request *req, blk_status_t error,
__blk_mq_end_request(req, error);
- if (scsi_target(sdev)->single_lun || - !list_empty(&sdev->host->starved_list)) { - if (!kblockd_schedule_work(&sdev->requeue_work)) - percpu_ref_put(&q->q_usage_counter); - } else { - blk_mq_run_hw_queues(q, true); - percpu_ref_put(&q->q_usage_counter); - } + scsi_run_queue_async(sdev); + + percpu_ref_put(&q->q_usage_counter); } else { unsigned long flags;
@@ -2206,6 +2216,7 @@ static blk_status_t scsi_queue_rq(struct blk_mq_hw_ctx *hctx, */ if (req->rq_flags & RQF_DONTPREP) scsi_mq_uninit_cmd(cmd); + scsi_run_queue_async(sdev); break; } return ret;
From: Ming Lei ming.lei@redhat.com
mainline inclusion from mainline-v5.10-rc1 commit ed5dd6a67d5eac5fb8873697b55dc1699752a9f3 category: bugfix bugzilla: 47875 CVE: NA
-------------------------------------------------
The request queue is currently run unconditionally in scsi_end_request() if both target queue and host queue are ready.
Recently Long Li reported that cost of a queue run can be very heavy in case of high queue depth. Improve this situation by only running the request queue when this LUN is busy.
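Since the hunks below are hard to read inline, the ->restarts handshake they introduce boils down to the following (condensed from the diff; refcounting and error paths omitted):

	/* scsi_mq_get_budget(): budget contention on the submission side */
	atomic_inc(&sdev->restarts);
	smp_mb__after_atomic();	/* order ->restarts write vs ->device_busy read */
	if (atomic_read(&sdev->device_busy) == 0 && !scsi_device_blocked(sdev))
		blk_mq_delay_run_hw_queue(hctx, SCSI_QUEUE_DELAY);

	/* scsi_run_queue_async(): completion side, run the queue only if
	 * contention was recorded since the last run
	 */
	int old = atomic_read(&sdev->restarts);

	if (old && atomic_cmpxchg(&sdev->restarts, old, 0) == old)
		blk_mq_run_hw_queues(sdev->request_queue, true);

So an idle LUN no longer pays for an unconditional queue run on every completion.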
Link: https://lore.kernel.org/r/20200910075056.36509-1-ming.lei@redhat.com Reported-by: Long Li longli@microsoft.com Tested-by: Long Li longli@microsoft.com Tested-by: Kashyap Desai kashyap.desai@broadcom.com Reviewed-by: Bart Van Assche bvanassche@acm.org Reviewed-by: Hannes Reinecke hare@suse.de Reviewed-by: Ewan D. Milne emilne@redhat.com Reviewed-by: John Garry john.garry@huawei.com Signed-off-by: Ming Lei ming.lei@redhat.com Signed-off-by: Martin K. Petersen martin.petersen@oracle.com Conflict: drivers/scsi/scsi_lib.c include/scsi/scsi_device.h Signed-off-by: Yufen Yu yuyufen@huawei.com Reviewed-by: Jason Yan yanaijie@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- drivers/scsi/scsi_lib.c | 34 +++++++++++++++++++++++++++++++++- include/scsi/scsi_device.h | 1 + 2 files changed, 34 insertions(+), 1 deletion(-)
diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c index 8f656ce3588d2..1dedad873955d 100644 --- a/drivers/scsi/scsi_lib.c +++ b/drivers/scsi/scsi_lib.c @@ -695,7 +695,23 @@ static void scsi_run_queue_async(struct scsi_device *sdev) if (!kblockd_schedule_work(&sdev->requeue_work)) percpu_ref_put(&q->q_usage_counter); } else { - blk_mq_run_hw_queues(q, true); + /* + * smp_mb() present in sbitmap_queue_clear() or implied in + * .end_io is for ordering writing .device_busy in + * scsi_device_unbusy() and reading sdev->restarts. + */ + int old = atomic_read(&sdev->restarts); + + /* + * ->restarts has to be kept as non-zero if new budget + * contention occurs. + * + * No need to run queue when either another re-run + * queue wins in updating ->restarts or a new budget + * contention occurs. + */ + if (old && atomic_cmpxchg(&sdev->restarts, old, 0) == old) + blk_mq_run_hw_queues(sdev->request_queue, true); percpu_ref_put(&q->q_usage_counter); } } @@ -2136,7 +2152,23 @@ static bool scsi_mq_get_budget(struct blk_mq_hw_ctx *hctx)
out_put_device: put_device(&sdev->sdev_gendev); + atomic_inc(&sdev->restarts); + + /* + * Orders atomic_inc(&sdev->restarts) and atomic_read(&sdev->device_busy). + * .restarts must be incremented before .device_busy is read because the + * code in scsi_run_queue_async() depends on the order of these operations. + */ + smp_mb__after_atomic(); out: + /* + * If all in-flight requests originated from this LUN are completed + * before reading .device_busy, sdev->device_busy will be observed as + * zero, then blk_mq_delay_run_hw_queues() will dispatch this request + * soon. Otherwise, completion of one of these requests will observe + * the .restarts flag, and the request queue will be run for handling + * this request, see scsi_end_request(). + */ if (atomic_read(&sdev->device_busy) == 0 && !scsi_device_blocked(sdev)) blk_mq_delay_run_hw_queue(hctx, SCSI_QUEUE_DELAY); return false; diff --git a/include/scsi/scsi_device.h b/include/scsi/scsi_device.h index 52b255e868a90..32920cb8a020f 100644 --- a/include/scsi/scsi_device.h +++ b/include/scsi/scsi_device.h @@ -110,6 +110,7 @@ struct scsi_device { atomic_t device_busy; /* commands actually active on LLDD */ atomic_t device_blocked; /* Device returned QUEUE_FULL. */
+ atomic_t restarts; spinlock_t list_lock; struct list_head cmd_list; /* queue of in use SCSI Command structures */ struct list_head starved_entry;
From: Ye Bin yebin10@huawei.com
This reverts commit 4f44b5030406002fc0b7d31b02c2942bcc0a6ec7.
hulk inclusion category: bugfix bugzilla: 49978 CVE: NA
-----------------------------------------------
We got the following error:

2021/02/26 10:15:49 parsed 1 programs
2021/02/26 10:15:49 executed programs: 0
Message from syslogd@localhost at Feb 26 10:15:52 ... kernel:[ 710.135641] page:ffff7e000309e600 count:-1 mapcount:0 mapping:0000000000000000 index:0x0
Message from syslogd@localhost at Feb 26 10:15:52 ... kernel:[ 710.136201] flags: 0xffffe0000000000()
sg_remove_scat() checks schp->k_use_sg and then frees the pages. But in sg_build_indirect(), when rem_sz > 0 we free the pages without clearing schp->k_use_sg or setting schp->pages[i] to NULL, so the same pages get freed again in sg_remove_scat().
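To make the double free easier to see, a simplified sketch of the two paths (abridged; the sg_remove_scat() loop is quoted from memory and may differ in detail):

	/* sg_build_indirect(), failure path before this revert */
	if (rem_sz > 0) {		/* must have failed */
		for (i = 0; i < k; i++)
			__free_pages(schp->pages[i], order);	/* pages freed here... */
		return -ENOMEM;		/* ...but k_use_sg/pages[] still describe them */
	}

	/* sg_remove_scat(), called later during the caller's cleanup */
	for (k = 0; k < schp->k_use_sg && schp->pages[k]; k++)
		__free_pages(schp->pages[k], schp->page_order);	/* freed again */

Reverting the extra loop leaves the cleanup to sg_remove_scat() alone.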
Fixes: 4f44b5030406 ("scsi: sg: fix memory leak in sg_build_indirect") Signed-off-by: Ye Bin yebin10@huawei.com Reviewed-by: Jason Yan yanaijie@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- drivers/scsi/sg.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-)
diff --git a/drivers/scsi/sg.c b/drivers/scsi/sg.c index 749faafbc9770..10da329fa53fb 100644 --- a/drivers/scsi/sg.c +++ b/drivers/scsi/sg.c @@ -1942,12 +1942,8 @@ sg_build_indirect(Sg_scatter_hold * schp, Sg_fd * sfp, int buff_size) k, rem_sz));
schp->bufflen = blk_size; - if (rem_sz > 0) { /* must have failed */ - for (i = 0; i < k; i++) - __free_pages(schp->pages[i], order); - + if (rem_sz > 0) /* must have failed */ return -ENOMEM; - } return 0; out: for (i = 0; i < k; i++)
From: Luo Meng luomeng12@huawei.com
hulk inclusion category: bugfix bugzilla: NA CVE: NA
-----------------------------------------------
This reverts commit 1f35007d8de64bc04327576a8f5818cc3446a305.
Signed-off-by: Luo Meng luomeng12@huawei.com Reviewed-by: zhangyi (F) yi.zhang@huawei.com Reviewed-by: Zhang Xiaoxu zhangxiaoxu5@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- include/linux/shmem_fs.h | 2 -- mm/shmem.c | 41 +--------------------------------------- 2 files changed, 1 insertion(+), 42 deletions(-)
diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h index 559cd51da3a4d..f155dc607112e 100644 --- a/include/linux/shmem_fs.h +++ b/include/linux/shmem_fs.h @@ -38,8 +38,6 @@ struct shmem_sb_info { spinlock_t shrinklist_lock; /* Protects shrinklist */ struct list_head shrinklist; /* List of shinkable inodes */ unsigned long shrinklist_len; /* Length of shrinklist */ - unsigned long __percpu *last_ino_number; /* Last inode number */ - atomic64_t shared_last_ino_number; /* Shared last inode number */ };
static inline struct shmem_inode_info *SHMEM_I(struct inode *inode) diff --git a/mm/shmem.c b/mm/shmem.c index e31b490ca095f..98939e57f6cc6 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2212,40 +2212,6 @@ static int shmem_mmap(struct file *file, struct vm_area_struct *vma) return 0; }
-#define SHMEM_LAST_INO_BATCH 1024 - -static unsigned long shmem_get_next_ino(struct shmem_sb_info *sbinfo) -{ - unsigned long *p; - unsigned long res; - int cpu; - - cpu = get_cpu(); - p = per_cpu_ptr(sbinfo->last_ino_number, cpu); - res = *p; - -#ifdef CONFIG_SMP - if (unlikely((res & (SHMEM_LAST_INO_BATCH-1)) == 0)) { - /* - * If OS is 32-bit, next will be truncated(Cause - * inode->i_ino is unsigned long, define next to long.) - */ - long next = atomic64_add_return(SHMEM_LAST_INO_BATCH, - &sbinfo->shared_last_ino_number); - - res = next - SHMEM_LAST_INO_BATCH; - } -#endif - - res++; - /* Avoid 0 in the low 32 bits: might appear deleted */ - if (unlikely(!(unsigned int)res)) - res++; - *p = res; - put_cpu(); - return res; -} - static struct inode *shmem_get_inode(struct super_block *sb, const struct inode *dir, umode_t mode, dev_t dev, unsigned long flags) { @@ -2258,7 +2224,7 @@ static struct inode *shmem_get_inode(struct super_block *sb, const struct inode
inode = new_inode(sb); if (inode) { - inode->i_ino = shmem_get_next_ino(sbinfo); + inode->i_ino = get_next_ino(); inode_init_owner(inode, dir, mode); inode->i_blocks = 0; inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode); @@ -3573,7 +3539,6 @@ static void shmem_put_super(struct super_block *sb) { struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
- free_percpu(sbinfo->last_ino_number); percpu_counter_destroy(&sbinfo->used_blocks); mpol_put(sbinfo->mpol); kfree(sbinfo); @@ -3622,10 +3587,6 @@ int shmem_fill_super(struct super_block *sb, void *data, int silent) spin_lock_init(&sbinfo->stat_lock); if (percpu_counter_init(&sbinfo->used_blocks, 0, GFP_KERNEL)) goto failed; - sbinfo->last_ino_number = alloc_percpu(unsigned long); - if (!sbinfo->last_ino_number) - goto failed; - atomic64_set(&sbinfo->shared_last_ino_number, 0); sbinfo->free_inodes = sbinfo->max_inodes; spin_lock_init(&sbinfo->shrinklist_lock); INIT_LIST_HEAD(&sbinfo->shrinklist);
From: Chris Down chris@chrisdown.name
mainline inclusion from mainline-v5.9-rc1 commit e809d5f0b5c912fe981dce738f3283b2010665f0 category: bugfix bugzilla: NA CVE: NA ---------------------------
Patch series "tmpfs: inode: Reduce risk of inum overflow", v7.
In Facebook production we are seeing heavy i_ino wraparounds on tmpfs. On affected tiers, in excess of 10% of hosts show multiple files with different content and the same inode number, with some servers even having as many as 150 duplicated inode numbers with differing file content.
This causes actual, tangible problems in production. For example, we have complaints from those working on remote caches that their application is reporting cache corruptions because it uses (device, inodenum) to establish the identity of a particular cache object, but because it's not unique any more, the application refuses to continue and reports cache corruption. Even worse, sometimes applications may not even detect the corruption but may continue anyway, causing phantom and hard to debug behaviour.
In general, userspace applications expect that (device, inodenum) should be enough to be uniquely point to one inode, which seems fair enough. One might also need to check the generation, but in this case:
1. That's not currently exposed to userspace (ioctl(...FS_IOC_GETVERSION...) returns ENOTTY on tmpfs);
2. Even with generation, there shouldn't be two live inodes with the same inode number on one device.
In order to mitigate this, we take a two-pronged approach:
1. Moving inum generation from being global to per-sb for tmpfs. This itself allows some reduction in i_ino churn. This works on both 64- and 32- bit machines.
2. Adding inode{64,32} for tmpfs. This fix is supported on machines with 64-bit ino_t only: we allow users to mount tmpfs with a new inode64 option that uses the full width of ino_t, or CONFIG_TMPFS_INODE64.
You can see how this compares to previous related patches which didn't implement this per-superblock:
- https://patchwork.kernel.org/patch/11254001/
- https://patchwork.kernel.org/patch/11023915/
This patch (of 2):
get_next_ino has a number of problems:
- It uses and returns a uint, which is susceptible to become overflowed if a lot of volatile inodes that use get_next_ino are created.
- It's global, with no specificity per-sb or even per-filesystem. This means it's not that difficult to cause inode number wraparounds on a single device, which can result in having multiple distinct inodes with the same inode number.
This patch adds a per-superblock counter that mitigates the second case. This design also allows us to later have a specific i_ino size per-device, for example, allowing users to choose whether to use 32- or 64-bit inodes for each tmpfs mount. This is implemented in the next commit.
For internal shmem mounts which may be less tolerant to spinlock delays, we implement a percpu batching scheme which only takes the stat_lock at each batch boundary.
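Condensed from the shmem_reserve_inode() hunk below, the batching for SB_KERNMOUNT mounts looks like this (refill path only, illustrative excerpt):

	/* take stat_lock only once every SHMEM_INO_BATCH allocations */
	ino_t *next_ino = per_cpu_ptr(sbinfo->ino_batch, get_cpu());
	ino_t ino = *next_ino;

	if (unlikely(ino % SHMEM_INO_BATCH == 0)) {
		spin_lock(&sbinfo->stat_lock);
		ino = sbinfo->next_ino;
		sbinfo->next_ino += SHMEM_INO_BATCH;
		spin_unlock(&sbinfo->stat_lock);
		if (unlikely(is_zero_ino(ino)))
			ino++;	/* userspace treats i_ino == 0 as deleted */
	}
	*inop = ino;
	*next_ino = ++ino;
	put_cpu();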
Signed-off-by: Chris Down chris@chrisdown.name Signed-off-by: Andrew Morton akpm@linux-foundation.org Acked-by: Hugh Dickins hughd@google.com Cc: Amir Goldstein amir73il@gmail.com Cc: Al Viro viro@zeniv.linux.org.uk Cc: Matthew Wilcox willy@infradead.org Cc: Jeff Layton jlayton@kernel.org Cc: Johannes Weiner hannes@cmpxchg.org Cc: Tejun Heo tj@kernel.org Link: http://lkml.kernel.org/r/cover.1594661218.git.chris@chrisdown.name Link: http://lkml.kernel.org/r/1986b9d63b986f08ec07a4aa4b2275e718e47d8a.1594661218... Signed-off-by: Linus Torvalds torvalds@linux-foundation.org Signed-off-by: Luo Meng luomeng12@huawei.com
Conflicts: mm/shmem.c Reviewed-by: zhangyi (F) yi.zhang@huawei.com Reviewed-by: Zhang Xiaoxu zhangxiaoxu5@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- include/linux/fs.h | 15 +++++++++ include/linux/shmem_fs.h | 2 ++ mm/shmem.c | 66 +++++++++++++++++++++++++++++++++++++--- 3 files changed, 78 insertions(+), 5 deletions(-)
diff --git a/include/linux/fs.h b/include/linux/fs.h index 8d5ee697727cd..46eb1d540606b 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3007,6 +3007,21 @@ extern void discard_new_inode(struct inode *); extern unsigned int get_next_ino(void); extern void evict_inodes(struct super_block *sb);
+/* + * Userspace may rely on the the inode number being non-zero. For example, glibc + * simply ignores files with zero i_ino in unlink() and other places. + * + * As an additional complication, if userspace was compiled with + * _FILE_OFFSET_BITS=32 on a 64-bit kernel we'll only end up reading out the + * lower 32 bits, so we need to check that those aren't zero explicitly. With + * _FILE_OFFSET_BITS=64, this may cause some harmless false-negatives, but + * better safe than sorry. + */ +static inline bool is_zero_ino(ino_t ino) +{ + return (u32)ino == 0; +} + extern void __iget(struct inode * inode); extern void iget_failed(struct inode *); extern void clear_inode(struct inode *); diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h index f155dc607112e..dd67c6e59ceaa 100644 --- a/include/linux/shmem_fs.h +++ b/include/linux/shmem_fs.h @@ -34,6 +34,8 @@ struct shmem_sb_info { unsigned char huge; /* Whether to try for hugepages */ kuid_t uid; /* Mount uid for root directory */ kgid_t gid; /* Mount gid for root directory */ + ino_t next_ino; /* The next per-sb inode number to use */ + ino_t __percpu *ino_batch; /* The next per-cpu inode number to use */ struct mempolicy *mpol; /* default memory policy for mappings */ spinlock_t shrinklist_lock; /* Protects shrinklist */ struct list_head shrinklist; /* List of shinkable inodes */ diff --git a/mm/shmem.c b/mm/shmem.c index 98939e57f6cc6..25205b46b053f 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -267,18 +267,67 @@ bool vma_is_shmem(struct vm_area_struct *vma) static LIST_HEAD(shmem_swaplist); static DEFINE_MUTEX(shmem_swaplist_mutex);
-static int shmem_reserve_inode(struct super_block *sb) +/* + * shmem_reserve_inode() performs bookkeeping to reserve a shmem inode, and + * produces a novel ino for the newly allocated inode. + * + * It may also be called when making a hard link to permit the space needed by + * each dentry. However, in that case, no new inode number is needed since that + * internally draws from another pool of inode numbers (currently global + * get_next_ino()). This case is indicated by passing NULL as inop. + */ +#define SHMEM_INO_BATCH 1024 +static int shmem_reserve_inode(struct super_block *sb, ino_t *inop) { struct shmem_sb_info *sbinfo = SHMEM_SB(sb); - if (sbinfo->max_inodes) { + ino_t ino; + + if (!(sb->s_flags & SB_KERNMOUNT)) { spin_lock(&sbinfo->stat_lock); if (!sbinfo->free_inodes) { spin_unlock(&sbinfo->stat_lock); return -ENOSPC; } sbinfo->free_inodes--; + if (inop) { + ino = sbinfo->next_ino++; + if (unlikely(is_zero_ino(ino))) + ino = sbinfo->next_ino++; + if (unlikely(ino > UINT_MAX)) { + /* + * Emulate get_next_ino uint wraparound for + * compatibility + */ + ino = 1; + } + *inop = ino; + } spin_unlock(&sbinfo->stat_lock); + } else if (inop) { + /* + * __shmem_file_setup, one of our callers, is lock-free: it + * doesn't hold stat_lock in shmem_reserve_inode since + * max_inodes is always 0, and is called from potentially + * unknown contexts. As such, use a per-cpu batched allocator + * which doesn't require the per-sb stat_lock unless we are at + * the batch boundary. + */ + ino_t *next_ino; + next_ino = per_cpu_ptr(sbinfo->ino_batch, get_cpu()); + ino = *next_ino; + if (unlikely(ino % SHMEM_INO_BATCH == 0)) { + spin_lock(&sbinfo->stat_lock); + ino = sbinfo->next_ino; + sbinfo->next_ino += SHMEM_INO_BATCH; + spin_unlock(&sbinfo->stat_lock); + if (unlikely(is_zero_ino(ino))) + ino++; + } + *inop = ino; + *next_ino = ++ino; + put_cpu(); } + return 0; }
@@ -2218,13 +2267,14 @@ static struct inode *shmem_get_inode(struct super_block *sb, const struct inode struct inode *inode; struct shmem_inode_info *info; struct shmem_sb_info *sbinfo = SHMEM_SB(sb); + ino_t ino;
- if (shmem_reserve_inode(sb)) + if (shmem_reserve_inode(sb, &ino)) return NULL;
inode = new_inode(sb); if (inode) { - inode->i_ino = get_next_ino(); + inode->i_ino = ino; inode_init_owner(inode, dir, mode); inode->i_blocks = 0; inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode); @@ -2938,7 +2988,7 @@ static int shmem_link(struct dentry *old_dentry, struct inode *dir, struct dentr * first link must skip that, to get the accounting right. */ if (inode->i_nlink) { - ret = shmem_reserve_inode(inode->i_sb); + ret = shmem_reserve_inode(inode->i_sb, NULL); if (ret) goto out; } @@ -3539,6 +3589,7 @@ static void shmem_put_super(struct super_block *sb) { struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
+ free_percpu(sbinfo->ino_batch); percpu_counter_destroy(&sbinfo->used_blocks); mpol_put(sbinfo->mpol); kfree(sbinfo); @@ -3583,6 +3634,11 @@ int shmem_fill_super(struct super_block *sb, void *data, int silent) #else sb->s_flags |= SB_NOUSER; #endif + if (sb->s_flags & SB_KERNMOUNT) { + sbinfo->ino_batch = alloc_percpu(ino_t); + if (!sbinfo->ino_batch) + goto failed; + }
spin_lock_init(&sbinfo->stat_lock); if (percpu_counter_init(&sbinfo->used_blocks, 0, GFP_KERNEL))
From: Chris Down chris@chrisdown.name
mainline inclusion from mainline-v5.9-rc1 commit ea3271f7196c65ae5d3e1c7b3f733892c017dbd6 category: bugfix bugzilla: NA CVE: NA ---------------------------
The default is still set to inode32 for backwards compatibility, but system administrators can opt in to the new 64-bit inode numbers by either:
1. Passing inode64 on the command line when mounting, or
2. Configuring the kernel with CONFIG_TMPFS_INODE64=y
The inode64 and inode32 names are used based on existing precedent from XFS.
[hughd@google.com: Kconfig fixes] Link: http://lkml.kernel.org/r/alpine.LSU.2.11.2008011928010.13320@eggly.anvils
Signed-off-by: Chris Down chris@chrisdown.name Signed-off-by: Hugh Dickins hughd@google.com Signed-off-by: Andrew Morton akpm@linux-foundation.org Reviewed-by: Amir Goldstein amir73il@gmail.com Acked-by: Hugh Dickins hughd@google.com Cc: Al Viro viro@zeniv.linux.org.uk Cc: Matthew Wilcox willy@infradead.org Cc: Jeff Layton jlayton@kernel.org Cc: Johannes Weiner hannes@cmpxchg.org Cc: Tejun Heo tj@kernel.org Link: http://lkml.kernel.org/r/8b23758d0c66b5e2263e08baf9c4b6a7565cbd8f.1594661218... Signed-off-by: Linus Torvalds torvalds@linux-foundation.org Reviewed-by: zhangyi (F) yi.zhang@huawei.com Reviewed-by: Zhang Xiaoxu zhangxiaoxu5@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- Documentation/filesystems/tmpfs.txt | 17 ++++++++ fs/Kconfig | 21 ++++++++++ include/linux/shmem_fs.h | 1 + mm/shmem.c | 61 ++++++++++++++++++++++++++--- 4 files changed, 95 insertions(+), 5 deletions(-)
diff --git a/Documentation/filesystems/tmpfs.txt b/Documentation/filesystems/tmpfs.txt index d06e9a59a9f4a..c72536fa78ae5 100644 --- a/Documentation/filesystems/tmpfs.txt +++ b/Documentation/filesystems/tmpfs.txt @@ -135,6 +135,21 @@ gid: The group id These options do not have any effect on remount. You can change these parameters with chmod(1), chown(1) and chgrp(1) on a mounted filesystem.
+tmpfs has a mount option to select whether it will wrap at 32- or 64-bit inode +numbers: + +======= ======================== +inode64 Use 64-bit inode numbers +inode32 Use 32-bit inode numbers +======= ======================== + +On a 32-bit kernel, inode32 is implicit, and inode64 is refused at mount time. +On a 64-bit kernel, CONFIG_TMPFS_INODE64 sets the default. inode64 avoids the +possibility of multiple files with the same inode number on a single device; +but risks glibc failing with EOVERFLOW once 33-bit inode numbers are reached - +if a long-lived tmpfs is accessed by 32-bit applications so ancient that +opening a file larger than 2GiB fails with EINVAL. +
So 'mount -t tmpfs -o size=10G,nr_inodes=10k,mode=700 tmpfs /mytmpfs' will give you tmpfs instance on /mytmpfs which can allocate 10GB @@ -147,3 +162,5 @@ Updated: Hugh Dickins, 4 June 2007 Updated: KOSAKI Motohiro, 16 Mar 2010 +Updated: + Chris Down, 13 July 2020 diff --git a/fs/Kconfig b/fs/Kconfig index 3fa013a399815..2d9d472d8ba84 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -190,6 +190,27 @@ config TMPFS_XATTR
If unsure, say N.
+config TMPFS_INODE64 + bool "Use 64-bit ino_t by default in tmpfs" + depends on TMPFS && 64BIT + default n + help + tmpfs has historically used only inode numbers as wide as an unsigned + int. In some cases this can cause wraparound, potentially resulting + in multiple files with the same inode number on a single device. This + option makes tmpfs use the full width of ino_t by default, without + needing to specify the inode64 option when mounting. + + But if a long-lived tmpfs is to be accessed by 32-bit applications so + ancient that opening a file larger than 2GiB fails with EINVAL, then + the INODE64 config option and inode64 mount option risk operations + failing with EOVERFLOW once 33-bit inode numbers are reached. + + To override this configured default, use the inode32 or inode64 + option when mounting. + + If unsure, say N. + config HUGETLBFS bool "HugeTLB file system support" depends on X86 || IA64 || SPARC64 || (S390 && 64BIT) || \ diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h index dd67c6e59ceaa..401afcf81efea 100644 --- a/include/linux/shmem_fs.h +++ b/include/linux/shmem_fs.h @@ -34,6 +34,7 @@ struct shmem_sb_info { unsigned char huge; /* Whether to try for hugepages */ kuid_t uid; /* Mount uid for root directory */ kgid_t gid; /* Mount gid for root directory */ + bool full_inums; /* If i_ino should be uint or ino_t */ ino_t next_ino; /* The next per-sb inode number to use */ ino_t __percpu *ino_batch; /* The next per-cpu inode number to use */ struct mempolicy *mpol; /* default memory policy for mappings */ diff --git a/mm/shmem.c b/mm/shmem.c index 25205b46b053f..ad5ca9ed943e8 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -293,12 +293,17 @@ static int shmem_reserve_inode(struct super_block *sb, ino_t *inop) ino = sbinfo->next_ino++; if (unlikely(is_zero_ino(ino))) ino = sbinfo->next_ino++; - if (unlikely(ino > UINT_MAX)) { + if (unlikely(!sbinfo->full_inums && + ino > UINT_MAX)) { /* * Emulate get_next_ino uint wraparound for * compatibility */ - ino = 1; + if (IS_ENABLED(CONFIG_64BIT)) + pr_warn("%s: inode number overflow on device %d, consider using inode64 mount option\n", + __func__, MINOR(sb->s_dev)); + sbinfo->next_ino = 1; + ino = sbinfo->next_ino++; } *inop = ino; } @@ -311,6 +316,10 @@ static int shmem_reserve_inode(struct super_block *sb, ino_t *inop) * unknown contexts. As such, use a per-cpu batched allocator * which doesn't require the per-sb stat_lock unless we are at * the batch boundary. + * + * We don't need to worry about inode{32,64} since SB_KERNMOUNT + * shmem mounts are not exposed to userspace, so we don't need + * to worry about things like glibc compatibility. */ ino_t *next_ino; next_ino = per_cpu_ptr(sbinfo->ino_batch, get_cpu()); @@ -3427,9 +3436,12 @@ static int shmem_parse_options(char *options, struct shmem_sb_info *sbinfo, if ((value = strchr(this_char,'=')) != NULL) { *value++ = 0; } else { - pr_err("tmpfs: No value for mount option '%s'\n", - this_char); - goto error; + if (strcmp(this_char,"inode32") && + strcmp(this_char,"inode64")) { + pr_err("tmpfs: No value for mount option '%s'\n", + this_char); + goto error; + } }
if (!strcmp(this_char,"size")) { @@ -3495,6 +3507,14 @@ static int shmem_parse_options(char *options, struct shmem_sb_info *sbinfo, if (mpol_parse_str(value, &mpol)) goto bad_val; #endif + } else if (!strcmp(this_char,"inode32")) { + sbinfo->full_inums = false; + } else if (!strcmp(this_char,"inode64")) { + if (sizeof(ino_t) < 8) { + pr_err("tmpfs: Cannot use inode64 with <64bit inums in kernel\n"); + goto error; + } + sbinfo->full_inums = true; } else { pr_err("tmpfs: Bad mount option %s\n", this_char); goto error; @@ -3539,8 +3559,15 @@ static int shmem_remount_fs(struct super_block *sb, int *flags, char *data) if (config.max_inodes && !sbinfo->max_inodes) goto out;
+ if (sbinfo->full_inums && !config.full_inums && + sbinfo->next_ino > UINT_MAX) { + pr_err("tmpfs: Current inum too high to switch to 32-bit inums\n"); + goto out; + } + error = 0; sbinfo->huge = config.huge; + sbinfo->full_inums = config.full_inums; sbinfo->max_blocks = config.max_blocks; sbinfo->max_inodes = config.max_inodes; sbinfo->free_inodes = config.max_inodes - inodes; @@ -3574,6 +3601,29 @@ static int shmem_show_options(struct seq_file *seq, struct dentry *root) if (!gid_eq(sbinfo->gid, GLOBAL_ROOT_GID)) seq_printf(seq, ",gid=%u", from_kgid_munged(&init_user_ns, sbinfo->gid)); + + /* + * Showing inode{64,32} might be useful even if it's the system default, + * since then people don't have to resort to checking both here and + * /proc/config.gz to confirm 64-bit inums were successfully applied + * (which may not even exist if IKCONFIG_PROC isn't enabled). + * + * We hide it when inode64 isn't the default and we are using 32-bit + * inodes, since that probably just means the feature isn't even under + * consideration. + * + * As such: + * + * +-----------------+-----------------+ + * | TMPFS_INODE64=y | TMPFS_INODE64=n | + * +------------------+-----------------+-----------------+ + * | full_inums=true | show | show | + * | full_inums=false | show | hide | + * +------------------+-----------------+-----------------+ + * + */ + if (IS_ENABLED(CONFIG_TMPFS_INODE64) || sbinfo->full_inums) + seq_printf(seq, ",inode%d", (sbinfo->full_inums ? 64 : 32)); #ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE /* Rightly or wrongly, show huge mount option unmasked by shmem_huge */ if (sbinfo->huge) @@ -3622,6 +3672,7 @@ int shmem_fill_super(struct super_block *sb, void *data, int silent) if (!(sb->s_flags & SB_KERNMOUNT)) { sbinfo->max_blocks = shmem_default_max_blocks(); sbinfo->max_inodes = shmem_default_max_inodes(); + sbinfo->full_inums = IS_ENABLED(CONFIG_TMPFS_INODE64); if (shmem_parse_options(data, sbinfo, false)) { err = -EINVAL; goto failed;
From: Byron Stanoszek gandalf@winds.org
mainline inclusion from mainline-v5.9-rc6 commit bb3e96d63eb75a2f4ff790b089f6b93614c729a1 category: bugfix bugzilla: NA CVE: NA ---------------------------
Commit e809d5f0b5c9 ("tmpfs: per-superblock i_ino support") made changes to shmem_reserve_inode() in mm/shmem.c, however the original test for (sbinfo->max_inodes) got dropped. This causes mounting tmpfs with option nr_inodes=0 to fail:
# mount -ttmpfs -onr_inodes=0 none /ext0
mount: /ext0: mount(2) system call failed: Cannot allocate memory.
This patch restores the nr_inodes=0 functionality.
Fixes: e809d5f0b5c9 ("tmpfs: per-superblock i_ino support") Signed-off-by: Byron Stanoszek gandalf@winds.org Signed-off-by: Andrew Morton akpm@linux-foundation.org Acked-by: Hugh Dickins hughd@google.com Acked-by: Chris Down chris@chrisdown.name Link: https://lkml.kernel.org/r/20200902035715.16414-1-gandalf@winds.org Signed-off-by: Linus Torvalds torvalds@linux-foundation.org Signed-off-by: Luo Meng luomeng12@huawei.com Reviewed-by: zhangyi (F) yi.zhang@huawei.com Reviewed-by: Zhang Xiaoxu zhangxiaoxu5@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- mm/shmem.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-)
diff --git a/mm/shmem.c b/mm/shmem.c index ad5ca9ed943e8..b4be0be77327c 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -284,11 +284,13 @@ static int shmem_reserve_inode(struct super_block *sb, ino_t *inop)
if (!(sb->s_flags & SB_KERNMOUNT)) { spin_lock(&sbinfo->stat_lock); - if (!sbinfo->free_inodes) { - spin_unlock(&sbinfo->stat_lock); - return -ENOSPC; + if (sbinfo->max_inodes) { + if (!sbinfo->free_inodes) { + spin_unlock(&sbinfo->stat_lock); + return -ENOSPC; + } + sbinfo->free_inodes--; } - sbinfo->free_inodes--; if (inop) { ino = sbinfo->next_ino++; if (unlikely(is_zero_ino(ino)))
hulk inclusion category: bugfix bugzilla: NA CVE: NA
-----------------------------------------------
Disable CONFIG_TMPFS_INODE64 by default.
Signed-off-by: Yang Yingliang yangyingliang@huawei.com Reviewed-by: zhangyi (F) yi.zhang@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- arch/arm64/configs/hulk_defconfig | 1 + 1 file changed, 1 insertion(+)
diff --git a/arch/arm64/configs/hulk_defconfig b/arch/arm64/configs/hulk_defconfig index 09048942eb0ae..f8f7890254641 100644 --- a/arch/arm64/configs/hulk_defconfig +++ b/arch/arm64/configs/hulk_defconfig @@ -4918,6 +4918,7 @@ CONFIG_SYSFS=y CONFIG_TMPFS=y CONFIG_TMPFS_POSIX_ACL=y CONFIG_TMPFS_XATTR=y +# CONFIG_TMPFS_INODE64 is not set CONFIG_HUGETLBFS=y CONFIG_HUGETLB_PAGE=y CONFIG_MEMFD_CREATE=y
From: "zhangyi (F)" yi.zhang@huawei.com
hulk inclusion category: bugfix bugzilla: 49893 CVE: NA ---------------------------
If we fail to add the new entry on rename whiteout, we cannot reset the old->de entry directly, because old->de could have moved from under us while the directory was made indexed. So we need to find the old entry again before resetting it; otherwise the filesystem may be corrupted.
Fixes: 21401081d4e ("ext4: fix bug for rename with RENAME_WHITEOUT") Signed-off-by: zhangyi (F) yi.zhang@huawei.com Reviewed-by: Yang Erkun yangerkun@huawei.com Reviewed-by: Ye bin yebin10@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- fs/ext4/namei.c | 29 +++++++++++++++++++++++++++-- 1 file changed, 27 insertions(+), 2 deletions(-)
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 558d8d9b8b224..a8face65e5fd4 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -3464,6 +3464,31 @@ static int ext4_setent(handle_t *handle, struct ext4_renament *ent, return 0; }
+static void ext4_resetent(handle_t *handle, struct ext4_renament *ent, + unsigned ino, unsigned file_type) +{ + struct ext4_renament old = *ent; + int retval = 0; + + /* + * old->de could have moved from under us during make indexed dir, + * so the old->de may no longer valid and need to find it again + * before reset old inode info. + */ + old.bh = ext4_find_entry(old.dir, &old.dentry->d_name, &old.de, NULL); + if (IS_ERR(old.bh)) + retval = PTR_ERR(old.bh); + if (!old.bh) + retval = -ENOENT; + if (retval) { + ext4_std_error(old.dir->i_sb, retval); + return; + } + + ext4_setent(handle, &old, ino, file_type); + brelse(old.bh); +} + static int ext4_find_delete_entry(handle_t *handle, struct inode *dir, const struct qstr *d_name) { @@ -3760,8 +3785,8 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, end_rename: if (whiteout) { if (retval) { - ext4_setent(handle, &old, - old.inode->i_ino, old_file_type); + ext4_resetent(handle, &old, + old.inode->i_ino, old_file_type); drop_nlink(whiteout); } unlock_new_inode(whiteout);
From: Yonglong Liu liuyonglong@huawei.com
driver inclusion category: feature bugzilla: NA CVE: NA
----------------------------
This patch adds support for setting/getting the PF max TX rate via sysfs:
echo <rate Mbit/s> > /sys/class/net/<eth name>/kunpeng/pf/max_tx_rate
cat /sys/class/net/<eth name>/kunpeng/pf/max_tx_rate
Signed-off-by: Yonglong Liu liuyonglong@huawei.com Reviewed-by: li yongxin liyongxin1@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- drivers/net/ethernet/hisilicon/hns3/Makefile | 3 +- drivers/net/ethernet/hisilicon/hns3/hnae3.h | 8 + .../hns3_extension/hns3pf/hclge_main_it.c | 33 +++++ .../hns3/hns3_extension/hns3pf/hclge_sysfs.c | 140 ++++++++++++++++++ .../hns3/hns3_extension/hns3pf/hclge_sysfs.h | 14 ++ .../hisilicon/hns3/hns3pf/hclge_main.c | 17 +++ .../hisilicon/hns3/hns3pf/hclge_main.h | 5 + 7 files changed, 219 insertions(+), 1 deletion(-) create mode 100644 drivers/net/ethernet/hisilicon/hns3/hns3_extension/hns3pf/hclge_sysfs.c create mode 100644 drivers/net/ethernet/hisilicon/hns3/hns3_extension/hns3pf/hclge_sysfs.h
diff --git a/drivers/net/ethernet/hisilicon/hns3/Makefile b/drivers/net/ethernet/hisilicon/hns3/Makefile index bd607c0c40942..5efa685e871d3 100644 --- a/drivers/net/ethernet/hisilicon/hns3/Makefile +++ b/drivers/net/ethernet/hisilicon/hns3/Makefile @@ -38,7 +38,8 @@ HCLGE_OBJ = hns3pf/hclge_main.o \ hns3pf/hclge_err.o
-HCLGE_OBJ_IT_MAIN = hns3_extension/hns3pf/hclge_main_it.o +HCLGE_OBJ_IT_MAIN = hns3_extension/hns3pf/hclge_main_it.o \ + hns3_extension/hns3pf/hclge_sysfs.o obj-$(CONFIG_HNS3_HCLGE) += hclge.o hclge-objs := $(HCLGE_OBJ) $(HCLGE_OBJ_IT_MAIN) hclge-$(CONFIG_HNS3_DCB) += hns3pf/hclge_dcb.o diff --git a/drivers/net/ethernet/hisilicon/hns3/hnae3.h b/drivers/net/ethernet/hisilicon/hns3/hnae3.h index 3c56cbc9afa66..7b3938054e5f1 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hnae3.h +++ b/drivers/net/ethernet/hisilicon/hns3/hnae3.h @@ -578,6 +578,9 @@ struct hnae3_ae_ops { int (*ecc_handle)(struct hnae3_ae_dev *ae_dev); int (*priv_ops)(struct hnae3_handle *handle, int opcode, void *data, int length); + void (*ext_init)(struct hnae3_handle *handle); + void (*ext_uninit)(struct hnae3_handle *handle); + void (*ext_reset_done)(struct hnae3_handle *handle); #endif };
@@ -713,6 +716,11 @@ struct hnae3_handle {
/* Network interface message level enabled bits */ u32 msg_enable; + +#ifdef CONFIG_HNS3_TEST + /* for sysfs */ + struct kobject *kobj; +#endif };
#define hnae3_set_field(origin, mask, shift, val) \ diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_extension/hns3pf/hclge_main_it.c b/drivers/net/ethernet/hisilicon/hns3/hns3_extension/hns3pf/hclge_main_it.c index 01f0e19f1e505..59e287848ee36 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3_extension/hns3pf/hclge_main_it.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3_extension/hns3pf/hclge_main_it.c @@ -18,6 +18,9 @@ #include "hclge_main.h" #include "hnae3.h" #include "hclge_main_it.h" +#ifdef CONFIG_HNS3_TEST +#include "hclge_sysfs.h" +#endif
#ifdef CONFIG_IT_VALIDATION #define HCLGE_RESET_MAX_FAIL_CNT 1 @@ -174,8 +177,38 @@ bool hclge_reset_done_it(struct hnae3_handle *handle, bool done) return done; }
+#ifdef CONFIG_HNS3_TEST +void hclge_ext_init(struct hnae3_handle *handle) +{ + hclge_sysfs_init(handle); +} + +void hclge_ext_uninit(struct hnae3_handle *handle) +{ + struct hclge_vport *vport = hclge_get_vport(handle); + struct hclge_dev *hdev = vport->back; + + hclge_reset_pf_rate(hdev); + hclge_sysfs_uninit(handle); +} + +void hclge_ext_reset_done(struct hnae3_handle *handle) +{ + struct hclge_vport *vport = hclge_get_vport(handle); + struct hclge_dev *hdev = vport->back; + + hclge_resume_pf_rate(hdev); +} +#endif + int hclge_init_it(void) { +#ifdef CONFIG_HNS3_TEST + hclge_ops.ext_init = hclge_ext_init; + hclge_ops.ext_uninit = hclge_ext_uninit; + hclge_ops.ext_reset_done = hclge_ext_reset_done; +#endif + hclge_ops.reset_event = hclge_reset_event_it; hclge_ops.reset_done = hclge_reset_done_it; hclge_ops.handle_imp_error = hclge_handle_imp_error_it; diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_extension/hns3pf/hclge_sysfs.c b/drivers/net/ethernet/hisilicon/hns3/hns3_extension/hns3pf/hclge_sysfs.c new file mode 100644 index 0000000000000..86e60938013c2 --- /dev/null +++ b/drivers/net/ethernet/hisilicon/hns3/hns3_extension/hns3pf/hclge_sysfs.c @@ -0,0 +1,140 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* Copyright (c) 2018-2021 Hisilicon Limited. */ + +#include <linux/device.h> +#include "hnae3.h" +#include "hclge_main.h" +#include "hclge_tm.h" +#include "hclge_sysfs.h" + +void hclge_reset_pf_rate(struct hclge_dev *hdev) +{ + struct hclge_vport *vport = &hdev->vport[0]; + int ret; + + /* zero means max rate, if max_tx_rate is zero, just return */ + if (!vport->vf_info.max_tx_rate) + return; + + vport->vf_info.max_tx_rate = 0; + + ret = hclge_tm_qs_shaper_cfg(vport, vport->vf_info.max_tx_rate); + if (ret) + dev_err(&hdev->pdev->dev, + "failed to reset pf tx rate to default, ret = %d.\n", + ret); +} + +int hclge_resume_pf_rate(struct hclge_dev *hdev) +{ + struct hclge_vport *vport = &hdev->vport[0]; + int ret; + + /* zero means max rate, after reset, firmware already set it to + * max rate, so just continue. + */ + if (!vport->vf_info.max_tx_rate) + return 0; + + ret = hclge_tm_qs_shaper_cfg(vport, vport->vf_info.max_tx_rate); + if (ret) { + dev_err(&hdev->pdev->dev, + "failed to resume pf tx rate:%u, ret = %d.\n", + vport->vf_info.max_tx_rate, ret); + return ret; + } + + return 0; +} + +static ssize_t hclge_max_tx_rate_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + struct hclge_vport *vport = + container_of(kobj, struct hclge_vport, kobj); + + return sprintf(buf, "%d Mbit/s (0 means no limit)\n", + vport->vf_info.max_tx_rate); +} + +static ssize_t hclge_max_tx_rate_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, + size_t size) +{ + struct hclge_vport *vport = + container_of(kobj, struct hclge_vport, kobj); + struct hclge_dev *hdev = vport->back; + int max_tx_rate; + int ret; + + ret = kstrtoint(buf, 0, &max_tx_rate); + if (ret) + return -EINVAL; + + if (max_tx_rate < 0 || max_tx_rate > hdev->hw.mac.max_speed) { + dev_err(&hdev->pdev->dev, + "invalid max_tx_rate:%d [0, %u]\n", + max_tx_rate, hdev->hw.mac.max_speed); + return -EINVAL; + } + + ret = hclge_tm_qs_shaper_cfg(vport, max_tx_rate); + if (ret) + return ret; + + vport->vf_info.max_tx_rate = max_tx_rate; + + return ret ? 
(ssize_t)ret : size; +} + +static struct kobj_attribute hclge_attr_max_tx_rate = { + .attr = {.name = "max_tx_rate", + .mode = 0644 }, + .show = hclge_max_tx_rate_show, + .store = hclge_max_tx_rate_store, +}; + +static struct attribute *hclge_sysfs_attrs[] = { + &hclge_attr_max_tx_rate.attr, + NULL, +}; + +static struct kobj_type hclge_sysfs_type = { + .sysfs_ops = &kobj_sysfs_ops, + .default_attrs = hclge_sysfs_attrs, +}; + +void hclge_sysfs_init(struct hnae3_handle *handle) +{ + struct net_device *netdev = handle->netdev; + struct hclge_vport *vport = hclge_get_vport(handle); + int ret; + + handle->kobj = kobject_create_and_add("kunpeng", &netdev->dev.kobj); + if (!handle->kobj) { + netdev_err(netdev, "failed to create kobj, ret = %d\n", ret); + return; + } + + ret = kobject_init_and_add(&vport->kobj, &hclge_sysfs_type, + handle->kobj, "pf"); + if (ret) { + netdev_err(netdev, "failed to init kobj, ret = %d\n", ret); + kobject_put(handle->kobj); + handle->kobj = NULL; + } +} + +void hclge_sysfs_uninit(struct hnae3_handle *handle) +{ + struct hclge_vport *vport = hclge_get_vport(handle); + + if (!handle->kobj) + return; + + kobject_put(&vport->kobj); + kobject_put(handle->kobj); + handle->kobj = NULL; +} diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_extension/hns3pf/hclge_sysfs.h b/drivers/net/ethernet/hisilicon/hns3/hns3_extension/hns3pf/hclge_sysfs.h new file mode 100644 index 0000000000000..8eb33357c5779 --- /dev/null +++ b/drivers/net/ethernet/hisilicon/hns3/hns3_extension/hns3pf/hclge_sysfs.h @@ -0,0 +1,14 @@ +/* SPDX-License-Identifier: GPL-2.0+ + * Copyright (c) 2018-2021 Hisilicon Limited. + */ + +#ifndef __HCLGE_SYSFS_H +#define __HCLGE_SYSFS_H + +void hclge_reset_pf_rate(struct hclge_dev *hdev); +int hclge_resume_pf_rate(struct hclge_dev *hdev); + +void hclge_sysfs_init(struct hnae3_handle *handle); +void hclge_sysfs_uninit(struct hnae3_handle *handle); + +#endif diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c index b40cf43ba8c0d..7b6d7c157747e 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c @@ -9946,6 +9946,11 @@ static int hclge_init_nic_client_instance(struct hnae3_ae_dev *ae_dev, if (ret) return ret;
+#ifdef CONFIG_HNS3_TEST + if (ae_dev->ops->ext_init) + ae_dev->ops->ext_init(&vport->nic); +#endif + set_bit(HCLGE_STATE_NIC_REGISTERED, &hdev->state); if (test_bit(HCLGE_STATE_RST_HANDLING, &hdev->state) || rst_cnt != hdev->rst_stats.reset_cnt) { @@ -10087,6 +10092,13 @@ static void hclge_uninit_client_instance(struct hnae3_client *client, struct hclge_vport *vport; int i;
+#ifdef CONFIG_HNS3_TEST + if (ae_dev->ops->ext_uninit) { + vport = &hdev->vport[0]; + ae_dev->ops->ext_uninit(&vport->nic); + } +#endif + for (i = 0; i < hdev->num_vmdq_vport + 1; i++) { vport = &hdev->vport[i]; if (hdev->roce_client) { @@ -10817,6 +10829,11 @@ static int hclge_reset_ae_dev(struct hnae3_ae_dev *ae_dev) if (ret) return ret;
+#ifdef CONFIG_HNS3_TEST + if (ae_dev->ops->ext_reset_done) + ae_dev->ops->ext_reset_done(&hdev->vport->nic); +#endif + dev_info(&pdev->dev, "Reset done, %s driver initialization finished.\n", HCLGE_DRIVER_NAME);
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h index 08d17ba61960e..f803a9d895c06 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h @@ -970,6 +970,11 @@ struct hclge_vport { struct list_head uc_mac_list; /* Store VF unicast table */ struct list_head mc_mac_list; /* Store VF multicast table */ struct list_head vlan_list; /* Store VF vlan table */ + +#ifdef CONFIG_HNS3_TEST + /* for sysfs */ + struct kobject kobj; +#endif };
int hclge_set_vport_promisc_mode(struct hclge_vport *vport, bool en_uc_pmc,
From: Yonglong Liu liuyonglong@huawei.com
driver inclusion category: feature bugzilla: NA CVE: NA
-----------------------------
This patch updates the driver version to 1.9.38.10.
Signed-off-by: Yonglong Liu liuyonglong@huawei.com Reviewed-by: li yongxin liyongxin1@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- drivers/net/ethernet/hisilicon/hns3/hnae3.h | 2 +- drivers/net/ethernet/hisilicon/hns3/hns3_cae/hns3_cae_version.h | 2 +- drivers/net/ethernet/hisilicon/hns3/hns3_enet.h | 2 +- drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h | 2 +- drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.h | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-)
diff --git a/drivers/net/ethernet/hisilicon/hns3/hnae3.h b/drivers/net/ethernet/hisilicon/hns3/hnae3.h index 7b3938054e5f1..d92c64c74451f 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hnae3.h +++ b/drivers/net/ethernet/hisilicon/hns3/hnae3.h @@ -30,7 +30,7 @@ #include <linux/pci.h> #include <linux/types.h>
-#define HNAE3_MOD_VERSION "1.9.38.9" +#define HNAE3_MOD_VERSION "1.9.38.10"
#define HNAE3_MIN_VECTOR_NUM 2 /* first one for misc, another for IO */
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_cae/hns3_cae_version.h b/drivers/net/ethernet/hisilicon/hns3/hns3_cae/hns3_cae_version.h index 782ed542b1a43..5563070bd4a2e 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3_cae/hns3_cae_version.h +++ b/drivers/net/ethernet/hisilicon/hns3/hns3_cae/hns3_cae_version.h @@ -4,7 +4,7 @@ #ifndef __HNS3_CAE_VERSION_H__ #define __HNS3_CAE_VERSION_H__
-#define HNS3_CAE_MOD_VERSION "1.9.38.9" +#define HNS3_CAE_MOD_VERSION "1.9.38.10"
#define CMT_ID_LEN 8 #define RESV_LEN 3 diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.h b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.h index c174bdbb98c47..c44e3ba782e52 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.h +++ b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.h @@ -8,7 +8,7 @@
#include "hnae3.h"
-#define HNS3_MOD_VERSION "1.9.38.9" +#define HNS3_MOD_VERSION "1.9.38.10"
extern char hns3_driver_version[];
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h index f803a9d895c06..6e89f8f78f0ff 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h @@ -12,7 +12,7 @@ #include "hclge_cmd.h" #include "hnae3.h"
-#define HCLGE_MOD_VERSION "1.9.38.9" +#define HCLGE_MOD_VERSION "1.9.38.10" #define HCLGE_DRIVER_NAME "hclge"
#define HCLGE_MAX_PF_NUM 8 diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.h b/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.h index 4f75b36c7888d..3b133a3b49058 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.h +++ b/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.h @@ -10,7 +10,7 @@ #include "hclgevf_cmd.h" #include "hnae3.h"
-#define HCLGEVF_MOD_VERSION "1.9.38.9" +#define HCLGEVF_MOD_VERSION "1.9.38.10" #define HCLGEVF_DRIVER_NAME "hclgevf"
#define HCLGEVF_MAX_VLAN_ID 4095
From: Yonglong Liu liuyonglong@huawei.com
driver inclusion category: bugfix bugzilla: NA CVE: NA
-----------------------------
This patch fixes the following warning: drivers/net/ethernet/hisilicon/hns3/hns3_extension/hns3pf/hclge_sysfs.c:117:3: warning: ‘ret’ may be used uninitialized in this function [-Wmaybe-uninitialized] netdev_err(netdev, "failed to create kobj, ret = %d\n", ret);
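For reference, a minimal sketch of why the compiler warns (illustrative, not the exact driver code; 'parent' and 'netdev' stand in for the netdev's kobject and net_device): kobject_create_and_add() reports failure by returning NULL rather than an errno, so nothing ever assigns 'ret' on that path.

  struct kobject *kobj;
  int ret;                                /* never written before the branch below */

  kobj = kobject_create_and_add("kunpeng", parent);
  if (!kobj) {
          /* reading 'ret' here is what -Wmaybe-uninitialized flags */
          netdev_err(netdev, "failed to create kobj, ret = %d\n", ret);
          return;
  }

Since there is no meaningful error code to print on this path, the fix simply drops 'ret' from the message.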
Fixes: f0ab3ab6c0fa ("net: hns3: adds support for setting pf max tx rate via sysfs") Signed-off-by: Yonglong Liu liuyonglong@huawei.com Reviewed-by: Zhong Zhaohui zhongzhaohui@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- .../ethernet/hisilicon/hns3/hns3_extension/hns3pf/hclge_sysfs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_extension/hns3pf/hclge_sysfs.c b/drivers/net/ethernet/hisilicon/hns3/hns3_extension/hns3pf/hclge_sysfs.c index 86e60938013c2..a307b592dc927 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3_extension/hns3pf/hclge_sysfs.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3_extension/hns3pf/hclge_sysfs.c @@ -114,7 +114,7 @@ void hclge_sysfs_init(struct hnae3_handle *handle)
handle->kobj = kobject_create_and_add("kunpeng", &netdev->dev.kobj); if (!handle->kobj) { - netdev_err(netdev, "failed to create kobj, ret = %d\n", ret); + netdev_err(netdev, "failed to create kobj!\n"); return; }
From: Yonglong Liu liuyonglong@huawei.com
driver inclusion category: bugfix bugzilla: NA CVE: NA
-----------------------------
This patch updates the driver version to 1.9.38.11.
Signed-off-by: Yonglong Liu liuyonglong@huawei.com Reviewed-by: Zhong Zhaohui zhongzhaohui@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- drivers/net/ethernet/hisilicon/hns3/hnae3.h | 2 +- drivers/net/ethernet/hisilicon/hns3/hns3_cae/hns3_cae_version.h | 2 +- drivers/net/ethernet/hisilicon/hns3/hns3_enet.h | 2 +- drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h | 2 +- drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.h | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-)
diff --git a/drivers/net/ethernet/hisilicon/hns3/hnae3.h b/drivers/net/ethernet/hisilicon/hns3/hnae3.h index d92c64c74451f..e407343fd0954 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hnae3.h +++ b/drivers/net/ethernet/hisilicon/hns3/hnae3.h @@ -30,7 +30,7 @@ #include <linux/pci.h> #include <linux/types.h>
-#define HNAE3_MOD_VERSION "1.9.38.10" +#define HNAE3_MOD_VERSION "1.9.38.11"
#define HNAE3_MIN_VECTOR_NUM 2 /* first one for misc, another for IO */
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_cae/hns3_cae_version.h b/drivers/net/ethernet/hisilicon/hns3/hns3_cae/hns3_cae_version.h index 5563070bd4a2e..20892a86599ab 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3_cae/hns3_cae_version.h +++ b/drivers/net/ethernet/hisilicon/hns3/hns3_cae/hns3_cae_version.h @@ -4,7 +4,7 @@ #ifndef __HNS3_CAE_VERSION_H__ #define __HNS3_CAE_VERSION_H__
-#define HNS3_CAE_MOD_VERSION "1.9.38.10" +#define HNS3_CAE_MOD_VERSION "1.9.38.11"
#define CMT_ID_LEN 8 #define RESV_LEN 3 diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.h b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.h index c44e3ba782e52..88c843d562d0d 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.h +++ b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.h @@ -8,7 +8,7 @@
#include "hnae3.h"
-#define HNS3_MOD_VERSION "1.9.38.10" +#define HNS3_MOD_VERSION "1.9.38.11"
extern char hns3_driver_version[];
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h index 6e89f8f78f0ff..6b9ca5a5c48b7 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h @@ -12,7 +12,7 @@ #include "hclge_cmd.h" #include "hnae3.h"
-#define HCLGE_MOD_VERSION "1.9.38.10" +#define HCLGE_MOD_VERSION "1.9.38.11" #define HCLGE_DRIVER_NAME "hclge"
#define HCLGE_MAX_PF_NUM 8 diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.h b/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.h index 3b133a3b49058..17cf19719b151 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.h +++ b/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.h @@ -10,7 +10,7 @@ #include "hclgevf_cmd.h" #include "hnae3.h"
-#define HCLGEVF_MOD_VERSION "1.9.38.10" +#define HCLGEVF_MOD_VERSION "1.9.38.11" #define HCLGEVF_DRIVER_NAME "hclgevf"
#define HCLGEVF_MAX_VLAN_ID 4095
From: shiyongbang shiyongbang@huawei.com
driver inclusion category: bugfix bugzilla: NA CVE: NA
After the connector is initialized, call drm_connector_register() to fix the erratic display seen during startup when installing from an ISO image.
Signed-off-by: Shiyongbang shiyongbang@huawei.com Reviewed-by: Wu yang wuyang7@huawei.com Reviewed-by: Li dongming lidongming5@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- drivers/gpu/drm/hisilicon/hibmc/hibmc_drm_vdac.c | 1 + 1 file changed, 1 insertion(+)
diff --git a/drivers/gpu/drm/hisilicon/hibmc/hibmc_drm_vdac.c b/drivers/gpu/drm/hisilicon/hibmc/hibmc_drm_vdac.c index 879ffb8c2413a..90319a9025d3a 100644 --- a/drivers/gpu/drm/hisilicon/hibmc/hibmc_drm_vdac.c +++ b/drivers/gpu/drm/hisilicon/hibmc/hibmc_drm_vdac.c @@ -123,6 +123,7 @@ hibmc_connector_init(struct hibmc_drm_private *priv) } drm_connector_helper_add(connector, &hibmc_connector_helper_funcs); + drm_connector_register(connector);
return connector; }
From: shiyongbang shiyongbang@huawei.com
driver inclusion category: bugfix bugzilla: NA CVE: NA
The hibmc probe path open-codes what the generic drm_get_pci_dev() helper already provides. Replace the duplicated code with drm_get_pci_dev().
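For context, a rough outline of what the generic helper does for legacy-style drivers (paraphrased from drm_pci.c, error handling omitted), which is why the .load/.unload hooks are wired into hibmc_driver:

  /* drm_get_pci_dev(pdev, ent, driver), roughly: */
  dev = drm_dev_alloc(driver, &pdev->dev);
  pci_enable_device(pdev);
  dev->pdev = pdev;
  pci_set_drvdata(pdev, dev);
  drm_dev_register(dev, ent->driver_data);        /* calls driver->load(dev, flags) */

On removal, drm_put_dev() unregisters the device (invoking driver->unload()) and drops the last reference, so the open-coded probe/remove paths become unnecessary.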
Signed-off-by: Shiyongbang shiyongbang@huawei.com Reviewed-by: Wu yang wuyang7@huawei.com Reviewed-by: Li dongming lidongming5@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- .../gpu/drm/hisilicon/hibmc/hibmc_drm_drv.c | 52 +++---------------- .../gpu/drm/hisilicon/hibmc/hibmc_drm_drv.h | 2 + 2 files changed, 8 insertions(+), 46 deletions(-)
diff --git a/drivers/gpu/drm/hisilicon/hibmc/hibmc_drm_drv.c b/drivers/gpu/drm/hisilicon/hibmc/hibmc_drm_drv.c index e08655aaf14a7..b9a6643a65158 100644 --- a/drivers/gpu/drm/hisilicon/hibmc/hibmc_drm_drv.c +++ b/drivers/gpu/drm/hisilicon/hibmc/hibmc_drm_drv.c @@ -58,6 +58,8 @@ irqreturn_t hibmc_drm_interrupt(int irq, void *arg) static struct drm_driver hibmc_driver = { .driver_features = DRIVER_GEM | DRIVER_MODESET | DRIVER_ATOMIC | DRIVER_HAVE_IRQ, + .load = hibmc_load, + .unload = hibmc_unload, .fops = &hibmc_fops, .name = "hibmc", .date = "20160828", @@ -284,7 +286,7 @@ static int hibmc_hw_init(struct hibmc_drm_private *priv) return 0; }
-static int hibmc_unload(struct drm_device *dev) +void hibmc_unload(struct drm_device *dev) { struct hibmc_drm_private *priv = dev->dev_private;
@@ -301,10 +303,9 @@ static int hibmc_unload(struct drm_device *dev) hibmc_mm_fini(priv); hibmc_hw_unmap(priv); dev->dev_private = NULL; - return 0; }
-static int hibmc_load(struct drm_device *dev) +int hibmc_load(struct drm_device *dev, unsigned long flags) { struct hibmc_drm_private *priv; int ret; @@ -366,56 +367,15 @@ static int hibmc_load(struct drm_device *dev) static int hibmc_pci_probe(struct pci_dev *pdev, const struct pci_device_id *ent) { - struct drm_device *dev; - int ret; - - dev = drm_dev_alloc(&hibmc_driver, &pdev->dev); - if (IS_ERR(dev)) { - DRM_ERROR("failed to allocate drm_device\n"); - return PTR_ERR(dev); - } - - dev->pdev = pdev; - pci_set_drvdata(pdev, dev); - - ret = pci_enable_device(pdev); - if (ret) { - DRM_ERROR("failed to enable pci device: %d\n", ret); - goto err_free; - } - - ret = hibmc_load(dev); - if (ret) { - DRM_ERROR("failed to load hibmc: %d\n", ret); - goto err_disable; - } - - ret = drm_dev_register(dev, 0); - if (ret) { - DRM_ERROR("failed to register drv for userspace access: %d\n", - ret); - goto err_unload; - } - return 0; - -err_unload: - hibmc_unload(dev); -err_disable: - pci_disable_device(pdev); -err_free: - drm_dev_unref(dev); - - return ret; + return drm_get_pci_dev(pdev, ent, &hibmc_driver); }
static void hibmc_pci_remove(struct pci_dev *pdev) { struct drm_device *dev = pci_get_drvdata(pdev);
- drm_dev_unregister(dev); - hibmc_unload(dev); + drm_put_dev(dev); pci_disable_device(pdev); - drm_dev_unref(dev); }
static void hibmc_pci_shutdown(struct pci_dev *pdev) diff --git a/drivers/gpu/drm/hisilicon/hibmc/hibmc_drm_drv.h b/drivers/gpu/drm/hisilicon/hibmc/hibmc_drm_drv.h index e195521eb41e9..4395dc6674bbc 100644 --- a/drivers/gpu/drm/hisilicon/hibmc/hibmc_drm_drv.h +++ b/drivers/gpu/drm/hisilicon/hibmc/hibmc_drm_drv.h @@ -85,6 +85,8 @@ void hibmc_set_power_mode(struct hibmc_drm_private *priv, unsigned int power_mode); void hibmc_set_current_gate(struct hibmc_drm_private *priv, unsigned int gate); +int hibmc_load(struct drm_device *dev, unsigned long flags); +void hibmc_unload(struct drm_device *dev);
int hibmc_de_init(struct hibmc_drm_private *priv); int hibmc_vdac_init(struct hibmc_drm_private *priv);
From: shiyongbang shiyongbang@huawei.com
driver inclusion category: bugfix bugzilla: NA CVE: NA
Add a hibmc_remove_framebuffers() helper that removes conflicting framebuffers, fixing a hang when switching from the GUI to a text console during installation from an ISO image.
Signed-off-by: Shiyongbang shiyongbang@huawei.com Reviewed-by: Wu yang wuyang7@huawei.com Reviewed-by: Li dongming lidongming5@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- .../gpu/drm/hisilicon/hibmc/hibmc_drm_drv.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+)
diff --git a/drivers/gpu/drm/hisilicon/hibmc/hibmc_drm_drv.c b/drivers/gpu/drm/hisilicon/hibmc/hibmc_drm_drv.c index b9a6643a65158..7be784a77efa3 100644 --- a/drivers/gpu/drm/hisilicon/hibmc/hibmc_drm_drv.c +++ b/drivers/gpu/drm/hisilicon/hibmc/hibmc_drm_drv.c @@ -55,6 +55,22 @@ irqreturn_t hibmc_drm_interrupt(int irq, void *arg) return IRQ_HANDLED; }
+static void hibmc_remove_framebuffers(struct pci_dev *pdev) +{ + struct apertures_struct *ap; + + ap = alloc_apertures(1); + if (!ap) + return; + + ap->ranges[0].base = pci_resource_start(pdev, 0); + ap->ranges[0].size = pci_resource_len(pdev, 0); + + drm_fb_helper_remove_conflicting_framebuffers(ap, "hibmcdrmfb", false); + + kfree(ap); +} + static struct drm_driver hibmc_driver = { .driver_features = DRIVER_GEM | DRIVER_MODESET | DRIVER_ATOMIC | DRIVER_HAVE_IRQ, @@ -367,6 +383,8 @@ int hibmc_load(struct drm_device *dev, unsigned long flags) static int hibmc_pci_probe(struct pci_dev *pdev, const struct pci_device_id *ent) { + hibmc_remove_framebuffers(pdev); + return drm_get_pci_dev(pdev, ent, &hibmc_driver); }
From: Dong Kai dongkai11@huawei.com
hulk inclusion category: bugfix bugzilla: 50425 CVE: NA
---------------------------
jump_label_apply_nops() is used to replace the default nops with ideal nops. It may be called only once per module. The current logic is incorrect because the call is made per klp_object; if a klp patch contains multiple klp_objects to be patched, it is called more than once and leads to the following crash:
livepatch_scsi_test: loading out-of-tree module taints kernel. livepatch_scsi_test: tainting kernel with TAINT_LIVEPATCH jump_label: Fatal kernel bug, unexpected op at patch_exit+0x0/0x38 [livepatch_scsi_test] [(____ptrval____)] (66 66 66 66 90) 80 ------------[ cut here ]------------ kernel BUG at arch/x86/kernel/jump_label.c:37! invalid opcode: 0000 [#1] SMP NOPTI CPU: 0 PID: 116 Comm: insmod Tainted: G OE K --------- - - 4.18.0+ #13 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.12.1-0-ga5cab58e9a3f-prebuilt.qemu.org 04/01/2014 RIP: 0010:bug_at+0x1d/0x20 Code: 02 ba d1 04 00 00 ee c3 90 90 90 90 90 66 66 66 66 90 41 89 f0 48 89 f9 48 89 fa 48 89 fe 48 c7 c7 98 79 c6 9d e8 01 79 0f 00 <0f> 0b 90 66 66 66 66 90 48 83 ec 18 48 8b 3f 65 48 8b 04 25 28
RSP: 0018:ffffb074c0563bb0 EFLAGS: 00000282 RAX: 000000000000007f RBX: ffffffffc036f018 RCX: ffffffff9de5abc8 RDX: 0000000000000000 RSI: 0000000000000082 RDI: 0000000000000246 RBP: ffffffffc036f018 R08: 0000000000000175 R09: 20363628205d295f R10: ffffa00743f8b580 R11: 3038202930392036 R12: ffffa00743f7fb40 R13: ffffffffc036f100 R14: ffffa00743f22160 R15: 0000000000000001 FS: 0000000001a088c0(0000) GS:ffffa00747800000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000001a0f000 CR3: 0000000003fb6000 CR4: 00000000000006f0 Call Trace: __jump_label_transform.isra.0+0x50/0x130 jump_label_apply_nops+0x64/0x70 klp_register_patch+0x55b/0x6f0 ? _cond_resched+0x15/0x40 ? kmem_cache_alloc_trace+0x1a6/0x1c0 ? patch_free_scaffold+0xa7/0xbf [livepatch_scsi_test] patch_init+0x43e/0x1000 [livepatch_scsi_test] ? 0xffffffffc037f000 do_one_initcall+0x46/0x1c8 ? free_unref_page_commit+0x95/0x120 ? _cond_resched+0x15/0x40 ? kmem_cache_alloc_trace+0x3e/0x1c0 do_init_module+0x5b/0x1fc load_module+0x15b1/0x1e50 ? vmap_page_range_noflush+0x33f/0x4a0 ? __do_sys_init_module+0x167/0x1a0 __do_sys_init_module+0x167/0x1a0 do_syscall_64+0x5b/0x1b0 entry_SYSCALL_64_after_hwframe+0x65/0xca RIP: 0033:0x4d63a9
To solve this, move the jump_label_apply_nops() call into klp_init_patch().
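A minimal sketch of the change in call placement (illustrative pseudostructure; the text_mutex and module_enable_ro() handling shown in the diff below is omitted here):

  /*
   * Before: reached from klp_init_object_loaded(), which runs once per
   * klp_object, so a patch with several objects applied the nops repeatedly.
   */
  klp_for_each_object(patch, obj)
          ret = klp_init_object(patch, obj);   /* -> klp_init_object_loaded() -> jump_label_apply_nops() */

  /*
   * After: done exactly once per patch module, in klp_init_patch(),
   * after all objects have been initialized.
   */
  jump_label_apply_nops(patch->mod);
  ret = jump_label_register(patch->mod);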
Fixes: 292937f547e6 ("livepatch/core: support jump_label") Signed-off-by: Dong Kai dongkai11@huawei.com Reviewed-by: Yang Jihong yangjihong1@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- kernel/livepatch/core.c | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-)
diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c index 81c8b02ce3d15..c981d400fe552 100644 --- a/kernel/livepatch/core.c +++ b/kernel/livepatch/core.c @@ -1074,17 +1074,10 @@ static int klp_init_object_loaded(struct klp_patch *patch, }
arch_klp_init_object_loaded(patch, obj); - - set_mod_klp_rel_state(patch->mod, MODULE_KLP_REL_DONE); - jump_label_apply_nops(patch->mod); module_enable_ro(patch->mod, true);
mutex_unlock(&text_mutex);
- ret = jump_label_register(patch->mod); - if (ret) - return ret; - klp_for_each_func(obj, func) { ret = klp_find_object_symbol(obj->name, func->old_name, func->old_sympos, @@ -1222,6 +1215,19 @@ static int klp_init_patch(struct klp_patch *patch) goto free; }
+ set_mod_klp_rel_state(patch->mod, MODULE_KLP_REL_DONE); + mutex_lock(&text_mutex); + module_disable_ro(patch->mod); + jump_label_apply_nops(patch->mod); + ret = jump_label_register(patch->mod); + if (ret) { + module_enable_ro(patch->mod, true); + mutex_unlock(&text_mutex); + goto free; + } + module_enable_ro(patch->mod, true); + mutex_unlock(&text_mutex); + #ifdef CONFIG_LIVEPATCH_WO_FTRACE klp_for_each_object(patch, obj) klp_load_hook(obj);
From: Tang Yizhou tangyizhou@huawei.com
ascend inclusion category: bugfix bugzilla: 47462 CVE: NA
-------------------------------------------------
do_munmap() is called in sp_munmap_task_areas(), so the caller must hold mm->mmap_sem for writing around it.
Signed-off-by: Tang Yizhou tangyizhou@huawei.com Reviewed-by: Ding Tianhong dingtianhong@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- mm/share_pool.c | 2 ++ 1 file changed, 2 insertions(+)
diff --git a/mm/share_pool.c b/mm/share_pool.c index 4fa693bd7f973..a14e8c678bbc3 100644 --- a/mm/share_pool.c +++ b/mm/share_pool.c @@ -661,7 +661,9 @@ int sp_group_add_task(int pid, int spg_id) "failed (potential no enough memory): %d " "spa type is %d\n", ret, spa->type); } + down_write(&mm->mmap_sem); sp_munmap_task_areas(mm, spa->link.next); + up_write(&mm->mmap_sem); spin_lock(&sp_area_lock); break; }
From: Tang Yizhou tangyizhou@huawei.com
ascend inclusion category: bugfix bugzilla: 47462 CVE: NA
-------------------------------------------------
KASAN report: [ 127.094921] BUG: KASAN: use-after-free in rb_next+0x18/0xa8 [ 127.095591] Read of size 8 at addr ffff8000cffb0130 by task cat/642 [ 127.096169] [ 127.096935] CPU: 1 PID: 642 Comm: cat Tainted: G OE 4.19.170+ #168 [ 127.097499] Hardware name: linux,dummy-virt (DT) [ 127.098200] Call trace: [ 127.098508] dump_backtrace+0x0/0x268 [ 127.098885] show_stack+0x24/0x30 [ 127.099241] dump_stack+0x104/0x15c [ 127.099754] print_address_description+0x68/0x278 [ 127.100317] kasan_report+0x208/0x328 [ 127.100683] __asan_load8+0x84/0xa8 [ 127.101035] rb_next+0x18/0xa8 [ 127.101355] spa_stat_show+0x148/0x378 [ 127.101746] seq_read+0x160/0x730 [ 127.102106] proc_reg_read+0xac/0x100 [ 127.102492] do_iter_read+0x248/0x290 [ 127.102860] vfs_readv+0xe4/0x140 [ 127.103220] default_file_splice_read+0x298/0x4e0 [ 127.103765] do_splice_to+0xa8/0xe0 [ 127.104179] splice_direct_to_actor+0x180/0x3d8 [ 127.104603] do_splice_direct+0x100/0x178 [ 127.104991] do_sendfile+0x2ec/0x520 [ 127.105363] __arm64_sys_sendfile64+0x204/0x250 [ 127.105792] el0_svc_common+0xb0/0x2d0 [ 127.106168] el0_svc_handler+0x40/0x90 [ 127.106523] el0_svc+0x10/0x248
The reason is that __sp_area_drop_locked(spa) may free the spa and its corresponding rbtree node, after which the subsequent rb_next(node) call reads freed memory: a use-after-free.
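A minimal sketch of the deferred-drop pattern used by the fix (simplified from rb_spa_stat_show(); the printing done while the lock is dropped is elided, and __sp_area_drop_locked() is assumed to accept a NULL argument, which the first iteration relies on):

  struct rb_node *node;
  struct sp_area *spa, *prev = NULL;

  spin_lock(&sp_area_lock);
  for (node = rb_first(&sp_area_root); node; node = rb_next(node)) {
          __sp_area_drop_locked(prev);     /* prev was still referenced when rb_next() walked past it */
          spa = rb_entry(node, struct sp_area, rb_node);
          prev = spa;
          atomic_inc(&spa->use_count);     /* keeps spa and its rb_node alive across the unlock */

          spin_unlock(&sp_area_lock);
          /* ... seq_printf() the fields of spa here, without the lock ... */
          spin_lock(&sp_area_lock);
  }
  __sp_area_drop_locked(prev);             /* release the reference of the last entry */
  spin_unlock(&sp_area_lock);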
Signed-off-by: Tang Yizhou tangyizhou@huawei.com Reviewed-by: Ding Tianhong dingtianhong@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- mm/share_pool.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-)
diff --git a/mm/share_pool.c b/mm/share_pool.c index a14e8c678bbc3..e7926581d9e16 100644 --- a/mm/share_pool.c +++ b/mm/share_pool.c @@ -2513,12 +2513,15 @@ int proc_sp_group_state(struct seq_file *m, struct pid_namespace *ns, static void rb_spa_stat_show(struct seq_file *seq) { struct rb_node *node; - struct sp_area *spa; + struct sp_area *spa, *prev = NULL;
spin_lock(&sp_area_lock);
for (node = rb_first(&sp_area_root); node; node = rb_next(node)) { + __sp_area_drop_locked(prev); + spa = rb_entry(node, struct sp_area, rb_node); + prev = spa; atomic_inc(&spa->use_count); spin_unlock(&sp_area_lock);
@@ -2557,9 +2560,8 @@ static void rb_spa_stat_show(struct seq_file *seq) seq_printf(seq, "%-10d\n", atomic_read(&spa->use_count));
spin_lock(&sp_area_lock); - __sp_area_drop_locked(spa); } - + __sp_area_drop_locked(prev); spin_unlock(&sp_area_lock); }
From: Tang Yizhou tangyizhou@huawei.com
ascend inclusion category: perf bugzilla: 47462 CVE: NA
-------------------------------------------------
With the new fine-grained locking design, ESPGMMEXIT is no longer needed.
Signed-off-by: Tang Yizhou tangyizhou@huawei.com Reviewed-by: Ding Tianhong dingtianhong@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- mm/share_pool.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-)
diff --git a/mm/share_pool.c b/mm/share_pool.c index e7926581d9e16..fec1c60db869b 100644 --- a/mm/share_pool.c +++ b/mm/share_pool.c @@ -51,7 +51,6 @@ #define AC_SINGLE_OWNER 1
#define spg_valid(spg) ((spg) && ((spg)->is_alive == true)) -#define ESPGMMEXIT 4000
#define byte2kb(size) ((size) >> 10) #define byte2mb(size) ((size) >> 20) @@ -1617,18 +1616,13 @@ static void *sp_make_share_kva_to_spg(unsigned long kva, struct sp_area *spa,
list_for_each_entry_safe(mm, tmp, &spg->procs, sp_node) { ret_addr = sp_remap_kva_to_vma(kva, spa, mm); - if (IS_ERR_VALUE(ret_addr) && (ret_addr != -ESPGMMEXIT)) { + if (IS_ERR_VALUE(ret_addr)) { pr_err("share pool: remap k2u to spg failed, ret %ld \n", ret_addr); __sp_free(spg, spa->va_start, spa_size(spa), mm); p = ERR_PTR(ret_addr); goto out; }
- if (ret_addr == -ESPGMMEXIT) { - pr_info("share pool: remap k2u, ret is -ESPGMMEXIT\n"); - continue; - } - uva = ret_addr; } p = (void *)uva;
From: Tang Yizhou tangyizhou@huawei.com
ascend inclusion category: doc bugzilla: 47462 CVE: NA
-------------------------------------------------
Some of the comments are outdated and need to be updated.
Signed-off-by: Tang Yizhou tangyizhou@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- include/linux/share_pool.h | 4 ++-- mm/share_pool.c | 25 +++++++++++++++++++++---- 2 files changed, 23 insertions(+), 6 deletions(-)
diff --git a/include/linux/share_pool.h b/include/linux/share_pool.h index 4a18c88d5a10e..26e44d51fd849 100644 --- a/include/linux/share_pool.h +++ b/include/linux/share_pool.h @@ -74,7 +74,7 @@ struct sp_group { struct file *file_hugetlb; /* list head of processes */ struct list_head procs; - /* list of sp_area */ + /* list of sp_area. it is protected by spin_lock sp_area_lock */ struct list_head spa_list; /* number of sp_area */ atomic_t spa_num; @@ -95,7 +95,7 @@ struct sp_group { unsigned long dvpp_va_start; unsigned long dvpp_size; atomic_t use_count; - /* protect the group internal elements */ + /* protect the group internal elements, except spa_list */ struct rw_semaphore rw_lock; };
diff --git a/mm/share_pool.c b/mm/share_pool.c index fec1c60db869b..1fc27cfd3932e 100644 --- a/mm/share_pool.c +++ b/mm/share_pool.c @@ -574,7 +574,18 @@ int sp_group_add_task(int pid, int spg_id) goto out_unlock; }
- /* current thread may be exiting in a multithread process */ + /* + * group_leader: current thread may be exiting in a multithread process + * + * DESIGN IDEA + * We increase mm->mm_users deliberately to ensure it's decreased in + * share pool under only 2 circumstances, which will simply the overall + * design as mm won't be freed unexpectedly. + * + * The corresponding refcount decrements are as follows: + * 1. the error handling branch of THIS function. + * 2. In sp_group_exit(). It's called only when process is exiting. + */ mm = get_task_mm(tsk->group_leader); if (!mm) { ret = -ESRCH; @@ -677,6 +688,7 @@ int sp_group_add_task(int pid, int spg_id) idr_remove(&sp_stat_idr, mm->sp_stat_id); kfree(stat); mm->sp_stat_id = 0; + /* spg->procs is modified, spg->rw_lock should be put below */ list_del(&mm->sp_node); mm->sp_group = NULL; } @@ -686,7 +698,7 @@ int sp_group_add_task(int pid, int spg_id) if (unlikely(ret)) __sp_group_drop_locked(spg); out_put_mm: - /* No need to put the mm if the sp group add this mm success.*/ + /* No need to put the mm if the sp group adds this mm successfully */ if (unlikely(ret)) mmput(mm); out_put_task: @@ -1504,6 +1516,7 @@ static unsigned long __sp_remap_get_pfn(unsigned long kva) return pfn; }
+/* when called by k2u to group, always make sure rw_lock of spg is down */ static unsigned long sp_remap_kva_to_vma(unsigned long kva, struct sp_area *spa, struct mm_struct *mm) { @@ -2857,8 +2870,11 @@ void sp_group_exit(struct mm_struct *mm)
spg = mm->sp_group;
- /* If the mm_users is 2, it means that the mm is ready to be freed - because the last owner of this mm is in exiting process. + /* + * Recall we add mm->users by 1 deliberately in sp_group_add_task(). + * If the mm_users is 2, it means that the mm is ready to be freed + * because the last owner of this mm is in exiting procedure: + * do_exit() -> exit_mm() -> mmput() -> THIS function. */ if (spg_valid(spg) && atomic_read(&mm->mm_users) == MM_WOULD_FREE) { spg_exit_lock(&unlock); @@ -2870,6 +2886,7 @@ void sp_group_exit(struct mm_struct *mm) if (!is_alive) blocking_notifier_call_chain(&sp_notifier_chain, 0, mm->sp_group); + /* match with get_task_mm() in sp_group_add_task() */ atomic_dec(&mm->mm_users); spg_exit_unlock(unlock); }
From: Tang Yizhou tangyizhou@huawei.com
ascend inclusion category: perf bugzilla: 47462 CVE: NA
-------------------------------------------------
The struct sp_proc_stat of a process that uses the share pool may be accessed concurrently. By turning its counters into atomic64_t, no lock is needed for them anymore.
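A minimal sketch of the resulting lock-free accounting (illustrative fragments using the atomic64_* interfaces the patch switches to):

  atomic64_set(&stat->alloc_size, 0);                      /* init */
  atomic64_add(size_aligned, &stat->alloc_size);           /* sp_alloc() path */
  atomic64_sub(spa->real_size, &stat->alloc_size);         /* sp_free() path */
  seq_printf(m, "%-9ld", byte2kb(atomic64_read(&stat->alloc_size)));  /* readers need no lock */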
Signed-off-by: Tang Yizhou tangyizhou@huawei.com Reviewed-by: Ding Tianhong dingtianhong@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- include/linux/share_pool.h | 4 ++-- mm/oom_kill.c | 4 +++- mm/share_pool.c | 47 +++++++++++++++++++++++--------------- 3 files changed, 33 insertions(+), 22 deletions(-)
diff --git a/include/linux/share_pool.h b/include/linux/share_pool.h index 26e44d51fd849..356781bfe3e0a 100644 --- a/include/linux/share_pool.h +++ b/include/linux/share_pool.h @@ -115,8 +115,8 @@ struct sp_proc_stat { * alloc amount minus free amount, may be negative when freed by * another task in the same sp group. */ - long alloc_size; - long k2u_size; + atomic64_t alloc_size; + atomic64_t k2u_size; };
#ifdef CONFIG_ASCEND_SHARE_POOL diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 1ca0f28c99636..86db5d5508234 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -434,7 +434,9 @@ static void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask) if (!stat) pr_cont("%-9c %-9c ", '-', '-'); else - pr_cont("%-9ld %-9ld ", (stat->alloc_size) >> 10, (stat->k2u_size) >> 10); /* byte to KB */ + pr_cont("%-9ld %-9ld ", /* byte to KB */ + atomic64_read(&stat->alloc_size) >> 10, + atomic64_read(&stat->k2u_size) >> 10); pr_cont("%8ld %8lu %5hd %s\n", mm_pgtables_bytes(task->mm), get_mm_counter(task->mm, MM_SWAPENTS), diff --git a/mm/share_pool.c b/mm/share_pool.c index 1fc27cfd3932e..38a9c7a88afc0 100644 --- a/mm/share_pool.c +++ b/mm/share_pool.c @@ -113,7 +113,8 @@ static struct sp_proc_stat *sp_init_proc_stat(struct task_struct *tsk, return ERR_PTR(-ENOMEM); }
- stat->alloc_size = stat->k2u_size = 0; + atomic64_set(&stat->alloc_size, 0); + atomic64_set(&stat->k2u_size, 0); stat->mm = mm; get_task_comm(stat->comm, tsk); ret = idr_alloc(&sp_stat_idr, stat, tgid, tgid + 1, GFP_KERNEL); @@ -736,6 +737,7 @@ static void spg_exit_unlock(bool unlock) void sp_group_post_exit(struct mm_struct *mm) { struct sp_proc_stat *stat; + long alloc_size, k2u_size; bool unlock;
if (!enable_ascend_share_pool || !mm->sp_group) @@ -757,13 +759,15 @@ void sp_group_post_exit(struct mm_struct *mm) * * We decide to print a info when seeing both of the scenarios. */ - if (stat && (stat->alloc_size != 0 || stat->k2u_size != 0)) - pr_info("share pool: process %s(%d) of sp group %d exits. " - "It applied %ld aligned KB, k2u shared %ld aligned " - "KB\n", - stat->comm, mm->sp_stat_id, - mm->sp_group->id, byte2kb(stat->alloc_size), - byte2kb(stat->k2u_size)); + if (stat) { + alloc_size = atomic64_read(&stat->alloc_size); + k2u_size = atomic64_read(&stat->k2u_size); + if (alloc_size != 0 || k2u_size != 0) + pr_info("share pool: process %s(%d) of sp group %d exits. " + "It applied %ld aligned KB, k2u shared %ld aligned KB\n", + stat->comm, mm->sp_stat_id, mm->sp_group->id, + byte2kb(alloc_size), byte2kb(k2u_size)); + }
idr_remove(&sp_stat_idr, mm->sp_stat_id);
@@ -1217,11 +1221,11 @@ int sp_free(unsigned long addr) mutex_lock(&sp_mutex); /* pointer stat may be invalid because of kthread buff_module_guard_work */ if (current->mm == NULL) { - kthread_stat.alloc_size -= spa->real_size; + atomic64_sub(spa->real_size, &kthread_stat.alloc_size); } else { stat = idr_find(&sp_stat_idr, current->mm->sp_stat_id); if (stat) - stat->alloc_size -= spa->real_size; + atomic64_sub(spa->real_size, &stat->alloc_size); else BUG(); } @@ -1464,7 +1468,7 @@ void *sp_alloc(unsigned long size, unsigned long sp_flags, int spg_id) if (!IS_ERR(p)) { stat = idr_find(&sp_stat_idr, current->mm->sp_stat_id); if (stat) - stat->alloc_size += size_aligned; + atomic64_add(size_aligned, &stat->alloc_size); } mutex_unlock(&sp_mutex);
@@ -1824,7 +1828,7 @@ void *sp_make_share_k2u(unsigned long kva, unsigned long size, if (!IS_ERR(uva)) { mutex_lock(&sp_mutex); uva = uva + (kva - kva_aligned); - stat->k2u_size += size_aligned; + atomic64_add(size_aligned, &stat->k2u_size); mutex_unlock(&sp_mutex); } else { /* associate vma and spa */ @@ -2254,11 +2258,11 @@ static int sp_unshare_uva(unsigned long uva, unsigned long size, int pid, int sp mutex_lock(&sp_mutex); /* pointer stat may be invalid because of kthread buff_module_guard_work */ if (current->mm == NULL) { - kthread_stat.k2u_size -= spa->real_size; + atomic64_sub(spa->real_size, &kthread_stat.k2u_size); } else { stat = idr_find(&sp_stat_idr, current->mm->sp_stat_id); if (stat) - stat->k2u_size -= spa->real_size; + atomic64_sub(spa->real_size, &stat->k2u_size); else WARN(1, "share_pool: %s: null process stat\n", __func__); } @@ -2510,7 +2514,9 @@ int proc_sp_group_state(struct seq_file *m, struct pid_namespace *ns, seq_printf(m, "%-8s %-9s %-13s\n", "Group_ID", "SP_ALLOC", "HugePage Fail"); seq_printf(m, "%-8d %-9ld %-13d\n", - spg->id, byte2kb(stat->alloc_size), spg->hugepage_failures); + spg->id, + byte2kb(atomic64_read(&stat->alloc_size)), + spg->hugepage_failures); } mutex_unlock(&sp_mutex);
@@ -2729,8 +2735,10 @@ static int idr_proc_stat_cb(int id, void *p, void *data) else seq_printf(seq, "%-8d ", spg_id); seq_printf(seq, "%-9ld %-9ld %-9ld %-10ld %-8ld %-7ld %-7ld %-10ld\n", - byte2kb(stat->alloc_size), byte2kb(stat->k2u_size), sp_res, - non_sp_res, page2kb(mm->total_vm), page2kb(total_rss), + byte2kb(atomic64_read(&stat->alloc_size)), + byte2kb(atomic64_read(&stat->k2u_size)), + sp_res, non_sp_res, + page2kb(mm->total_vm), page2kb(total_rss), page2kb(shmem), non_sp_shm); mmput(mm);
@@ -2750,8 +2758,9 @@ static int proc_stat_show(struct seq_file *seq, void *offset) "Non-SP_RES", "VIRT", "RES", "Shm", "Non-SP_Shm"); /* print kthread buff_module_guard_work */ seq_printf(seq, "%-8s %-8s %-9ld %-9ld\n", - "guard", "-", byte2kb(kthread_stat.alloc_size), - byte2kb(kthread_stat.k2u_size)); + "guard", "-", + byte2kb(atomic64_read(&kthread_stat.alloc_size)), + byte2kb(atomic64_read(&kthread_stat.k2u_size))); idr_for_each(&sp_stat_idr, idr_proc_stat_cb, seq); return 0; }
From: Tang Yizhou tangyizhou@huawei.com
ascend inclusion category: perf bugzilla: 47462 CVE: NA
-------------------------------------------------
Introduce the rw_semaphore sp_stat_sem; it protects only the idr operations on sp_stat_idr.
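A minimal sketch of the locking rule (illustrative; only the idr accesses of sp_stat_idr are serialized by the new semaphore, while the objects it stores keep their own lifetime rules):

  static DEFINE_IDR(sp_stat_idr);
  static DECLARE_RWSEM(sp_stat_sem);

  /* lookup: shared */
  down_read(&sp_stat_sem);
  stat = idr_find(&sp_stat_idr, tgid);
  up_read(&sp_stat_sem);

  /* insert and remove: exclusive */
  down_write(&sp_stat_sem);
  ret = idr_alloc(&sp_stat_idr, stat, tgid, tgid + 1, GFP_KERNEL);
  up_write(&sp_stat_sem);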
Signed-off-by: Tang Yizhou tangyizhou@huawei.com Reviewed-by: Ding Tianhong dingtianhong@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- mm/share_pool.c | 66 ++++++++++++++++++++++++++++++++----------------- 1 file changed, 43 insertions(+), 23 deletions(-)
diff --git a/mm/share_pool.c b/mm/share_pool.c index 38a9c7a88afc0..2517e861c1fc6 100644 --- a/mm/share_pool.c +++ b/mm/share_pool.c @@ -23,6 +23,7 @@ #include <linux/mm_types.h> #include <linux/idr.h> #include <linux/mutex.h> +#include <linux/rwsem.h> #include <linux/spinlock.h> #include <linux/slab.h> #include <linux/rbtree.h> @@ -83,6 +84,8 @@ static DEFINE_IDA(sp_group_id_ida);
/* idr of all sp_proc_stats */ static DEFINE_IDR(sp_stat_idr); +/* rw semaphore for sp_stat_idr */ +static DECLARE_RWSEM(sp_stat_sem);
/* for kthread buff_module_guard_work */ static struct sp_proc_stat kthread_stat = {0}; @@ -100,7 +103,7 @@ static struct sp_proc_stat *sp_init_proc_stat(struct task_struct *tsk, int ret;
if (id) { - stat = idr_find(&sp_stat_idr, id); + stat = sp_get_proc_stat(id); /* other threads in the same process may have initialized it */ if (stat) return stat; @@ -117,7 +120,10 @@ static struct sp_proc_stat *sp_init_proc_stat(struct task_struct *tsk, atomic64_set(&stat->k2u_size, 0); stat->mm = mm; get_task_comm(stat->comm, tsk); + + down_write(&sp_stat_sem); ret = idr_alloc(&sp_stat_idr, stat, tgid, tgid + 1, GFP_KERNEL); + up_write(&sp_stat_sem); if (ret < 0) { if (printk_ratelimit()) pr_err("share pool: proc stat idr alloc failed %d\n", ret); @@ -686,15 +692,20 @@ int sp_group_add_task(int pid, int spg_id) spin_unlock(&sp_area_lock);
if (unlikely(ret)) { - idr_remove(&sp_stat_idr, mm->sp_stat_id); - kfree(stat); - mm->sp_stat_id = 0; /* spg->procs is modified, spg->rw_lock should be put below */ list_del(&mm->sp_node); mm->sp_group = NULL; } - up_write(&spg->rw_lock); + + if (unlikely(ret)) { + down_write(&sp_stat_sem); + idr_remove(&sp_stat_idr, mm->sp_stat_id); + up_write(&sp_stat_sem); + kfree(stat); + mm->sp_stat_id = 0; + } + out_drop_group: if (unlikely(ret)) __sp_group_drop_locked(spg); @@ -743,10 +754,7 @@ void sp_group_post_exit(struct mm_struct *mm) if (!enable_ascend_share_pool || !mm->sp_group) return;
- spg_exit_lock(&unlock); - - /* pointer stat must be valid, we don't need to check sanity */ - stat = idr_find(&sp_stat_idr, mm->sp_stat_id); + stat = sp_get_proc_stat(mm->sp_stat_id); /* * There are two basic scenarios when a process in the share pool is * exiting but its share pool memory usage is not 0. @@ -769,8 +777,11 @@ void sp_group_post_exit(struct mm_struct *mm) byte2kb(alloc_size), byte2kb(k2u_size)); }
+ down_write(&sp_stat_sem); idr_remove(&sp_stat_idr, mm->sp_stat_id); + up_write(&sp_stat_sem);
+ spg_exit_lock(&unlock); __sp_group_drop_locked(mm->sp_group); spg_exit_unlock(unlock);
@@ -1223,7 +1234,7 @@ int sp_free(unsigned long addr) if (current->mm == NULL) { atomic64_sub(spa->real_size, &kthread_stat.alloc_size); } else { - stat = idr_find(&sp_stat_idr, current->mm->sp_stat_id); + stat = sp_get_proc_stat(current->mm->sp_stat_id); if (stat) atomic64_sub(spa->real_size, &stat->alloc_size); else @@ -1466,7 +1477,7 @@ void *sp_alloc(unsigned long size, unsigned long sp_flags, int spg_id)
mutex_lock(&sp_mutex); if (!IS_ERR(p)) { - stat = idr_find(&sp_stat_idr, current->mm->sp_stat_id); + stat = sp_get_proc_stat(current->mm->sp_stat_id); if (stat) atomic64_add(size_aligned, &stat->alloc_size); } @@ -2260,7 +2271,7 @@ static int sp_unshare_uva(unsigned long uva, unsigned long size, int pid, int sp if (current->mm == NULL) { atomic64_sub(spa->real_size, &kthread_stat.k2u_size); } else { - stat = idr_find(&sp_stat_idr, current->mm->sp_stat_id); + stat = sp_get_proc_stat(current->mm->sp_stat_id); if (stat) atomic64_sub(spa->real_size, &stat->k2u_size); else @@ -2488,9 +2499,9 @@ struct sp_proc_stat *sp_get_proc_stat(int tgid) { struct sp_proc_stat *stat;
- mutex_lock(&sp_mutex); + down_read(&sp_stat_sem); stat = idr_find(&sp_stat_idr, tgid); - mutex_unlock(&sp_mutex); + up_read(&sp_stat_sem);
/* maybe NULL or not, we always return it */ return stat; @@ -2501,22 +2512,28 @@ int proc_sp_group_state(struct seq_file *m, struct pid_namespace *ns, { struct sp_group *spg = NULL; struct sp_proc_stat *stat; + int spg_id, hugepage_failures;
mutex_lock(&sp_mutex); spg = __sp_find_spg(task->pid, SPG_ID_DEFAULT); if (spg_valid(spg)) { - stat = idr_find(&sp_stat_idr, task->mm->sp_stat_id); - if (!stat) { - mutex_unlock(&sp_mutex); + spg_id = spg->id; + hugepage_failures = spg->hugepage_failures; + mutex_unlock(&sp_mutex); + + /* eliminate potential ABBA deadlock */ + stat = sp_get_proc_stat(task->mm->sp_stat_id); + if (!stat) return 0; - } + /* print the file header */ seq_printf(m, "%-8s %-9s %-13s\n", "Group_ID", "SP_ALLOC", "HugePage Fail"); seq_printf(m, "%-8d %-9ld %-13d\n", - spg->id, + spg_id, byte2kb(atomic64_read(&stat->alloc_size)), - spg->hugepage_failures); + hugepage_failures); + return 0; } mutex_unlock(&sp_mutex);
@@ -2704,11 +2721,11 @@ static int idr_proc_stat_cb(int id, void *p, void *data) long sp_alloc_nsize, non_sp_res, sp_res, non_sp_shm;
mutex_lock(&sp_mutex); - if (!mmget_not_zero(mm)) - goto out_unlock; /* * a task which is the target of k2u(to task) but without adding to a * sp group should be handled correctly. + * No longer mmget_not_zero(mm) but a process (k2u to task) may have + * problem */ spg = __sp_find_spg(id, SPG_ID_DEFAULT); if (!spg_valid(spg)) { @@ -2740,7 +2757,6 @@ static int idr_proc_stat_cb(int id, void *p, void *data) sp_res, non_sp_res, page2kb(mm->total_vm), page2kb(total_rss), page2kb(shmem), non_sp_shm); - mmput(mm);
out_unlock: mutex_unlock(&sp_mutex); @@ -2761,7 +2777,11 @@ static int proc_stat_show(struct seq_file *seq, void *offset) "guard", "-", byte2kb(atomic64_read(&kthread_stat.alloc_size)), byte2kb(atomic64_read(&kthread_stat.k2u_size))); + + /* pay attention to potential ABBA deadlock */ + down_read(&sp_stat_sem); idr_for_each(&sp_stat_idr, idr_proc_stat_cb, seq); + up_read(&sp_stat_sem); return 0; }
From: Tang Yizhou tangyizhou@huawei.com
ascend inclusion category: perf bugzilla: 47462 CVE: NA
-------------------------------------------------
This commit aims to accomplish our main goal: remove the 'big lock' sp_mutex.
We introduce the rw_semaphore sp_group_sem; it protects only the idr operations on sp_group_idr.
The critical sections originally protected by sp_mutex are divided into four main parts:
1. idr operations on sp_group_idr, now protected by sp_group_sem.
2. idr operations on sp_stat_idr, now protected by sp_stat_sem.
3. Access to the non-atomic members of struct sp_group, now protected by the rw_semaphore spg->rw_lock.
4. Access to the accounting members of struct sp_proc_stat, which have been converted to atomic types.
All of this work has been done, so sp_mutex can now be removed safely. However, we decide to keep sp_mutex; it can be used for inter-group operations in the future.
Meanwhile, we eliminate the ambiguity of spg_valid().
Currently the macro spg_valid(spg) conflates two failure cases: 1. spg is NULL. 2. spg is not NULL but the group is dead. This is not a good design; we can make it simpler and clearer.
In the new implementation, spg_valid() only covers the second case. This is especially helpful when taking spg->rw_lock before calling spg_valid(): spg must already be non-NULL at that point, so spg_valid() no longer needs a NULL check.
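A minimal sketch of the resulting call pattern (illustrative; it matches how the lookups in the diff below are converted):

  spg = __sp_find_spg(pid, SPG_ID_DEFAULT);
  if (!spg)                       /* "no group at all" is now an explicit check */
          return -ENODEV;

  down_read(&spg->rw_lock);
  if (!spg_valid(spg)) {          /* spg_valid() now only means "the group is alive" */
          up_read(&spg->rw_lock);
          return -ENODEV;
  }
  /* ... read the non-atomic members of spg under rw_lock ... */
  up_read(&spg->rw_lock);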
Signed-off-by: Tang Yizhou tangyizhou@huawei.com Reviewed-by: Ding Tianhong dingtianhong@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- mm/share_pool.c | 232 ++++++++++++++++++++++++++---------------------- 1 file changed, 124 insertions(+), 108 deletions(-)
diff --git a/mm/share_pool.c b/mm/share_pool.c index 2517e861c1fc6..fd5ad378cd3fa 100644 --- a/mm/share_pool.c +++ b/mm/share_pool.c @@ -51,7 +51,7 @@ #define AC_NONE 0 #define AC_SINGLE_OWNER 1
-#define spg_valid(spg) ((spg) && ((spg)->is_alive == true)) +#define spg_valid(spg) ((spg)->is_alive == true)
#define byte2kb(size) ((size) >> 10) #define byte2mb(size) ((size) >> 20) @@ -71,10 +71,13 @@ int sysctl_sp_debug_mode;
int sysctl_share_pool_map_lock_enable;
+/* for inter-group operations */ +static DEFINE_MUTEX(sp_mutex); + /* idr of all sp_groups */ static DEFINE_IDR(sp_group_idr); - -static DEFINE_MUTEX(sp_mutex); +/* rw semaphore for sp_group_idr */ +static DECLARE_RWSEM(sp_group_sem);
static BLOCKING_NOTIFIER_HEAD(sp_notifier_chain);
@@ -91,7 +94,7 @@ static DECLARE_RWSEM(sp_stat_sem); static struct sp_proc_stat kthread_stat = {0};
/* - * The caller must hold sp_mutex and ensure no concurrency problem + * The caller must ensure no concurrency problem * for task_struct and mm_struct. */ static struct sp_proc_stat *sp_init_proc_stat(struct task_struct *tsk, @@ -195,10 +198,15 @@ int sysctl_share_pool_hugepage_enable = 1;
static void free_sp_group(struct sp_group *spg);
+/* the caller make sure spg is not NULL */ static bool sp_group_get(struct sp_group *spg) { - if (spg_valid(spg) && atomic_inc_not_zero(&spg->use_count)) + down_read(&spg->rw_lock); + if (spg_valid(spg) && atomic_inc_not_zero(&spg->use_count)) { + up_read(&spg->rw_lock); return true; + } + up_read(&spg->rw_lock);
return false; } @@ -296,6 +304,7 @@ static unsigned long sp_remap_kva_to_vma(unsigned long kva, struct sp_area *spa,
static void free_sp_group_id(unsigned int spg_id) { + /* ida operation is protected by an internal spin_lock */ if ((spg_id >= SPG_ID_AUTO_MIN && spg_id <= SPG_ID_AUTO_MAX) || (spg_id >= SPG_ID_DVPP_PASS_THROUGH_MIN && spg_id <= SPG_ID_DVPP_PASS_THROUGH_MAX)) @@ -306,7 +315,9 @@ static void free_sp_group(struct sp_group *spg) { fput(spg->file); fput(spg->file_hugetlb); + down_write(&sp_group_sem); idr_remove(&sp_group_idr, spg->id); + up_write(&sp_group_sem); free_sp_group_id((unsigned int)spg->id); kfree(spg); } @@ -343,9 +354,9 @@ static struct sp_group *__sp_find_spg(int pid, int spg_id)
put_task_struct(tsk); } else { - mutex_lock(&sp_mutex); + down_read(&sp_group_sem); spg = idr_find(&sp_group_idr, spg_id); - mutex_unlock(&sp_mutex); + up_read(&sp_group_sem); }
return spg; @@ -358,12 +369,15 @@ int sp_group_id_by_pid(int pid)
check_interrupt_context();
- mutex_lock(&sp_mutex); spg = __sp_find_spg(pid, SPG_ID_DEFAULT); + if (!spg) + return -ENODEV; + + down_read(&spg->rw_lock); if (spg_valid(spg)) spg_id = spg->id; + up_read(&spg->rw_lock);
- mutex_unlock(&sp_mutex); return spg_id; } EXPORT_SYMBOL_GPL(sp_group_id_by_pid); @@ -375,7 +389,10 @@ static struct sp_group *find_or_alloc_sp_group(int spg_id) int ret; char name[20];
+ down_read(&sp_group_sem); spg = idr_find(&sp_group_idr, spg_id); + up_read(&sp_group_sem); + if (!spg) { struct user_struct *user = NULL; int hsize_log = MAP_HUGE_2MB >> MAP_HUGE_SHIFT; @@ -392,7 +409,7 @@ static struct sp_group *find_or_alloc_sp_group(int spg_id) atomic64_set(&spg->alloc_nsize, 0); atomic64_set(&spg->alloc_hsize, 0); atomic64_set(&spg->alloc_size, 0); - spg->is_alive = false; + spg->is_alive = true; spg->hugepage_failures = 0; spg->dvpp_multi_spaces = false; spg->owner = current->group_leader; @@ -402,8 +419,10 @@ static struct sp_group *find_or_alloc_sp_group(int spg_id)
init_rwsem(&spg->rw_lock);
- ret = idr_alloc(&sp_group_idr, spg, spg_id, spg_id+1, + down_write(&sp_group_sem); + ret = idr_alloc(&sp_group_idr, spg, spg_id, spg_id + 1, GFP_KERNEL); + up_write(&sp_group_sem); if (ret < 0) { if (printk_ratelimit()) pr_err("share pool: create group idr alloc failed\n"); @@ -441,7 +460,9 @@ static struct sp_group *find_or_alloc_sp_group(int spg_id) out_fput: fput(spg->file); out_idr: + down_write(&sp_group_sem); idr_remove(&sp_group_idr, spg_id); + up_write(&sp_group_sem); out_kfree: kfree(spg); return ERR_PTR(ret); @@ -484,12 +505,8 @@ static void sp_munmap_task_areas(struct mm_struct *mm, struct list_head *stop) /* The caller must hold sp_mutex. */ static void __sp_group_drop_locked(struct sp_group *spg) { - bool is_alive = spg->is_alive; - - if (atomic_dec_and_test(&spg->use_count)) { - BUG_ON(is_alive); + if (atomic_dec_and_test(&spg->use_count)) free_sp_group(spg); - } }
/** @@ -527,16 +544,25 @@ int sp_group_add_task(int pid, int spg_id) }
if (spg_id >= SPG_ID_AUTO_MIN && spg_id <= SPG_ID_AUTO_MAX) { - mutex_lock(&sp_mutex); + down_read(&sp_group_sem); spg = idr_find(&sp_group_idr, spg_id); + up_read(&sp_group_sem); + + if (!spg) { + if (printk_ratelimit()) + pr_err("share pool: spg %d hasn't been created\n", spg_id); + return -EINVAL; + } + + down_read(&spg->rw_lock); if (!spg_valid(spg)) { - mutex_unlock(&sp_mutex); + up_read(&spg->rw_lock); if (printk_ratelimit()) pr_err("share pool: task add group failed because group id %d " - "hasn't been create or dead\n", spg_id); + "is dead\n", spg_id); return -EINVAL; } - mutex_unlock(&sp_mutex); + up_read(&spg->rw_lock); }
if (spg_id == SPG_ID_AUTO) { @@ -629,8 +655,6 @@ int sp_group_add_task(int pid, int spg_id) mm->sp_group = spg;
down_write(&spg->rw_lock); - /* We reactive the spg even the spg exists already. */ - spg->is_alive = true; list_add_tail(&mm->sp_node, &spg->procs); /* * create mappings of existing shared memory segments into this @@ -721,37 +745,13 @@ int sp_group_add_task(int pid, int spg_id) } EXPORT_SYMBOL_GPL(sp_group_add_task);
-static void spg_exit_lock(bool *unlock) -{ - switch (mutex_trylock_recursive(&sp_mutex)) { - case MUTEX_TRYLOCK_RECURSIVE: - *unlock = false; - break; - case MUTEX_TRYLOCK_FAILED: - mutex_lock(&sp_mutex); - *unlock = true; - break; - case MUTEX_TRYLOCK_SUCCESS: - *unlock = true; - break; - default: - BUG(); - } -} - -static void spg_exit_unlock(bool unlock) -{ - if (unlock) - mutex_unlock(&sp_mutex); -} - void sp_group_post_exit(struct mm_struct *mm) { struct sp_proc_stat *stat; + struct sp_group *spg = mm->sp_group; long alloc_size, k2u_size; - bool unlock;
- if (!enable_ascend_share_pool || !mm->sp_group) + if (!spg || !enable_ascend_share_pool) return;
stat = sp_get_proc_stat(mm->sp_stat_id); @@ -781,9 +781,7 @@ void sp_group_post_exit(struct mm_struct *mm) idr_remove(&sp_stat_idr, mm->sp_stat_id); up_write(&sp_stat_sem);
- spg_exit_lock(&unlock); - __sp_group_drop_locked(mm->sp_group); - spg_exit_unlock(unlock); + __sp_group_drop_locked(spg);
kfree(stat); } @@ -1229,7 +1227,6 @@ int sp_free(unsigned long addr)
up_read(&spa->spg->rw_lock);
- mutex_lock(&sp_mutex); /* pointer stat may be invalid because of kthread buff_module_guard_work */ if (current->mm == NULL) { atomic64_sub(spa->real_size, &kthread_stat.alloc_size); @@ -1240,7 +1237,6 @@ int sp_free(unsigned long addr) else BUG(); } - mutex_unlock(&sp_mutex);
drop_spa: __sp_area_drop(spa); @@ -1346,22 +1342,23 @@ void *sp_alloc(unsigned long size, unsigned long sp_flags, int spg_id) spg = current->mm->sp_group; } else { /* other scenes */ if (spg_id != SPG_ID_DEFAULT) { - mutex_lock(&sp_mutex); + down_read(&sp_group_sem); /* the caller should be a member of the sp group */ if (spg != idr_find(&sp_group_idr, spg_id)) { - mutex_unlock(&sp_mutex); - goto out; + up_read(&sp_group_sem); + return ERR_PTR(-EINVAL); } - mutex_unlock(&sp_mutex); + up_read(&sp_group_sem); } }
+ down_read(&spg->rw_lock); if (!spg_valid(spg)) { - pr_err("share pool: sp alloc failed, spg is invalid\n"); - goto out; + up_read(&spg->rw_lock); + pr_err("share pool: sp alloc failed, spg is dead\n"); + return ERR_PTR(-ENODEV); }
- down_read(&spg->rw_lock); if (sp_flags & SP_HUGEPAGE) { file = spg->file_hugetlb; size_aligned = ALIGN(size, PMD_SIZE); @@ -1475,13 +1472,11 @@ void *sp_alloc(unsigned long size, unsigned long sp_flags, int spg_id) out: up_read(&spg->rw_lock);
- mutex_lock(&sp_mutex); if (!IS_ERR(p)) { stat = sp_get_proc_stat(current->mm->sp_stat_id); if (stat) atomic64_add(size_aligned, &stat->alloc_size); } - mutex_unlock(&sp_mutex);
/* this will free spa if mmap failed */ if (spa && !IS_ERR(spa)) @@ -1544,7 +1539,7 @@ static unsigned long sp_remap_kva_to_vma(unsigned long kva, struct sp_area *spa, int hsize_log = MAP_HUGE_2MB >> MAP_HUGE_SHIFT; unsigned long addr, buf, offset;
- if (spg_valid(spa->spg)) { + if (spa->spg != NULL) { /* k2u to group */ file = spa_file(spa); } else { @@ -1703,7 +1698,7 @@ static bool vmalloc_area_clr_flag(struct sp_area *spa, unsigned long kva, unsign void *sp_make_share_k2u(unsigned long kva, unsigned long size, unsigned long sp_flags, int pid, int spg_id) { - void *uva = ERR_PTR(-ENODEV); + void *uva; struct sp_group *spg; struct sp_area *spa; unsigned long kva_aligned; @@ -1754,7 +1749,6 @@ void *sp_make_share_k2u(unsigned long kva, unsigned long size, goto out_put_task; }
- mutex_lock(&sp_mutex); /* * Process statistics initialization. if the target process has been * added to a sp group, then stat will be returned immediately. @@ -1762,12 +1756,10 @@ void *sp_make_share_k2u(unsigned long kva, unsigned long size, */ stat = sp_init_proc_stat(tsk, mm); if (IS_ERR(stat)) { - mutex_unlock(&sp_mutex); uva = stat; pr_err("share pool: init proc stat failed, ret %lx\n", PTR_ERR(stat)); - goto out_unlock; + goto out; } - mutex_unlock(&sp_mutex);
spg = __sp_find_spg(pid, SPG_ID_DEFAULT); if (spg == NULL) { @@ -1776,7 +1768,7 @@ void *sp_make_share_k2u(unsigned long kva, unsigned long size, if (printk_ratelimit()) pr_err("share pool: k2task invalid spg id %d\n", spg_id); uva = ERR_PTR(-EINVAL); - goto out_unlock; + goto out; } spa = sp_alloc_area(size_aligned, sp_flags, NULL, SPA_TYPE_K2TASK); if (IS_ERR(spa)) { @@ -1785,7 +1777,7 @@ void *sp_make_share_k2u(unsigned long kva, unsigned long size, "(potential no enough virtual memory when -75): %ld\n", PTR_ERR(spa)); uva = spa; - goto out_unlock; + goto out; }
if (!vmalloc_area_set_flag(spa, kva_aligned, VM_SHAREPOOL)) { @@ -1795,16 +1787,20 @@ void *sp_make_share_k2u(unsigned long kva, unsigned long size, }
uva = sp_make_share_kva_to_task(kva_aligned, spa, mm); - } else if (spg_valid(spg)) { + goto accounting; + } + + down_read(&spg->rw_lock); + if (spg_valid(spg)) { /* k2u to group */ if (spg_id != SPG_ID_DEFAULT && spg_id != spg->id) { + up_read(&spg->rw_lock); if (printk_ratelimit()) pr_err("share pool: k2spg invalid spg id %d\n", spg_id); uva = ERR_PTR(-EINVAL); - goto out_unlock; + goto out; }
- down_read(&spg->rw_lock); if (enable_share_k2u_spg) spa = sp_alloc_area(size_aligned, sp_flags, spg, SPA_TYPE_K2SPG); else @@ -1817,10 +1813,11 @@ void *sp_make_share_k2u(unsigned long kva, unsigned long size, "(potential no enough virtual memory when -75): %ld\n", PTR_ERR(spa)); uva = spa; - goto out_unlock; + goto out; }
if (!vmalloc_area_set_flag(spa, kva_aligned, VM_SHAREPOOL)) { + up_read(&spg->rw_lock); pr_err("share pool: %s: the kva %pK is not valid\n", __func__, (void *)kva_aligned); goto out_drop_spa; } @@ -1830,17 +1827,17 @@ void *sp_make_share_k2u(unsigned long kva, unsigned long size, else uva = sp_make_share_kva_to_task(kva_aligned, spa, mm);
- up_read(&spg->rw_lock); } else { /* group is dead, return -ENODEV */ pr_err("share pool: failed to make k2u, sp group is dead\n"); + uva = ERR_PTR(-ENODEV); } + up_read(&spg->rw_lock);
+accounting: if (!IS_ERR(uva)) { - mutex_lock(&sp_mutex); uva = uva + (kva - kva_aligned); atomic64_add(size_aligned, &stat->k2u_size); - mutex_unlock(&sp_mutex); } else { /* associate vma and spa */ if (!vmalloc_area_clr_flag(spa, kva_aligned, VM_SHAREPOOL)) @@ -1850,7 +1847,7 @@ void *sp_make_share_k2u(unsigned long kva, unsigned long size,
out_drop_spa: __sp_area_drop(spa); -out_unlock: +out: mmput(mm); out_put_task: put_task_struct(tsk); @@ -2243,12 +2240,21 @@ static int sp_unshare_uva(unsigned long uva, unsigned long size, int pid, int sp goto out_drop_area; }
+ if (unlikely(!spa->spg)) { + WARN(1, "share pool: unshare uva NULL spg pointer\n"); + ret = -EINVAL; + goto out_drop_area; + } + + down_read(&spa->spg->rw_lock); if (!spg_valid(spa->spg)) { + up_read(&spa->spg->rw_lock); if (printk_ratelimit()) pr_info("share pool: no need to unshare uva(to group), " - "spa doesn't belong to a sp group or group is dead\n"); + "sp group of spa is dead\n"); goto out_clr_flag; } + up_read(&spa->spg->rw_lock);
/* alway allow kthread and dvpp channel destroy procedure */ if (current->mm && current->mm->sp_group != spa->spg) { @@ -2266,7 +2272,6 @@ static int sp_unshare_uva(unsigned long uva, unsigned long size, int pid, int sp
sp_dump_stack();
- mutex_lock(&sp_mutex); /* pointer stat may be invalid because of kthread buff_module_guard_work */ if (current->mm == NULL) { atomic64_sub(spa->real_size, &kthread_stat.k2u_size); @@ -2275,9 +2280,8 @@ static int sp_unshare_uva(unsigned long uva, unsigned long size, int pid, int sp if (stat) atomic64_sub(spa->real_size, &stat->k2u_size); else - WARN(1, "share_pool: %s: null process stat\n", __func__); + WARN(1, "share pool: %s: null process stat\n", __func__); } - mutex_unlock(&sp_mutex);
out_clr_flag: /* deassociate vma and spa */ @@ -2453,17 +2457,21 @@ bool sp_config_dvpp_range(size_t start, size_t size, int device_id, int pid) size > MMAP_SHARE_POOL_16G_SIZE) return false;
- mutex_lock(&sp_mutex); spg = __sp_find_spg(pid, SPG_ID_DEFAULT); + if (!spg) + return false; + + down_write(&spg->rw_lock); if (!spg_valid(spg) || spg->dvpp_multi_spaces == true) { - mutex_unlock(&sp_mutex); + up_write(&spg->rw_lock); return false; } spg->dvpp_va_start = start; spg->dvpp_size = size; spg->dvpp_multi_spaces = true; + up_write(&spg->rw_lock); + host_svm_sp_enable = true; - mutex_unlock(&sp_mutex);
return true; } @@ -2514,12 +2522,15 @@ int proc_sp_group_state(struct seq_file *m, struct pid_namespace *ns, struct sp_proc_stat *stat; int spg_id, hugepage_failures;
- mutex_lock(&sp_mutex); spg = __sp_find_spg(task->pid, SPG_ID_DEFAULT); + if (!spg) + return 0; + + down_read(&spg->rw_lock); if (spg_valid(spg)) { spg_id = spg->id; hugepage_failures = spg->hugepage_failures; - mutex_unlock(&sp_mutex); + up_read(&spg->rw_lock);
/* eliminate potential ABBA deadlock */ stat = sp_get_proc_stat(task->mm->sp_stat_id); @@ -2535,7 +2546,7 @@ int proc_sp_group_state(struct seq_file *m, struct pid_namespace *ns, hugepage_failures); return 0; } - mutex_unlock(&sp_mutex); + up_read(&spg->rw_lock);
return 0; } @@ -2555,12 +2566,16 @@ static void rb_spa_stat_show(struct seq_file *seq) atomic_inc(&spa->use_count); spin_unlock(&sp_area_lock);
- mutex_lock(&sp_mutex); - if (spg_valid(spa->spg)) - seq_printf(seq, "%-10d ", spa->spg->id); - else /* k2u for task or spg is dead */ + if (!spa->spg) /* k2u to task */ seq_printf(seq, "%-10s ", "None"); - mutex_unlock(&sp_mutex); + else { + down_read(&spa->spg->rw_lock); + if (spg_valid(spa->spg)) /* k2u to group */ + seq_printf(seq, "%-10d ", spa->spg->id); + else /* spg is dead */ + seq_printf(seq, "%-10s ", "Dead"); + up_read(&spa->spg->rw_lock); + }
seq_printf(seq, "%2s%-14lx %2s%-14lx %-13ld ", "0x", spa->va_start, @@ -2682,9 +2697,9 @@ void spg_overview_show(struct seq_file *seq) atomic_read(&spg_stat.spa_total_num)); }
- mutex_lock(&sp_mutex); + down_read(&sp_group_sem); idr_for_each(&sp_group_idr, idr_spg_stat_cb, seq); - mutex_unlock(&sp_mutex); + up_read(&sp_group_sem);
if (seq != NULL) seq_puts(seq, "\n"); @@ -2720,7 +2735,6 @@ static int idr_proc_stat_cb(int id, void *p, void *data) */ long sp_alloc_nsize, non_sp_res, sp_res, non_sp_shm;
- mutex_lock(&sp_mutex); /* * a task which is the target of k2u(to task) but without adding to a * sp group should be handled correctly. @@ -2728,6 +2742,10 @@ static int idr_proc_stat_cb(int id, void *p, void *data) * problem */ spg = __sp_find_spg(id, SPG_ID_DEFAULT); + if (!spg) + goto out; + + down_read(&spg->rw_lock); if (!spg_valid(spg)) { spg_id = 0; sp_alloc_nsize = 0; @@ -2737,6 +2755,7 @@ static int idr_proc_stat_cb(int id, void *p, void *data) sp_alloc_nsize = byte2kb(atomic64_read(&spg->alloc_nsize)); sp_res = byte2kb(atomic64_read(&spg->alloc_size)); } + up_read(&spg->rw_lock);
anon = get_mm_counter(mm, MM_ANONPAGES); file = get_mm_counter(mm, MM_FILEPAGES); @@ -2758,9 +2777,7 @@ static int idr_proc_stat_cb(int id, void *p, void *data) page2kb(mm->total_vm), page2kb(total_rss), page2kb(shmem), non_sp_shm);
-out_unlock: - mutex_unlock(&sp_mutex); - +out: return 0; }
@@ -2891,34 +2908,33 @@ EXPORT_SYMBOL(sharepool_no_page);
void sp_group_exit(struct mm_struct *mm) { - struct sp_group *spg = NULL; - bool is_alive = true, unlock; + struct sp_group *spg = mm->sp_group; + bool is_alive;
- if (!enable_ascend_share_pool) + if (!spg || !enable_ascend_share_pool) return;
- spg = mm->sp_group; - /* * Recall we add mm->users by 1 deliberately in sp_group_add_task(). * If the mm_users is 2, it means that the mm is ready to be freed * because the last owner of this mm is in exiting procedure: * do_exit() -> exit_mm() -> mmput() -> THIS function. */ + down_write(&spg->rw_lock); if (spg_valid(spg) && atomic_read(&mm->mm_users) == MM_WOULD_FREE) { - spg_exit_lock(&unlock); - down_write(&spg->rw_lock); if (list_is_singular(&spg->procs)) is_alive = spg->is_alive = false; - list_del(&mm->sp_node); + list_del(&mm->sp_node); /* affect spg->procs */ up_write(&spg->rw_lock); + if (!is_alive) blocking_notifier_call_chain(&sp_notifier_chain, 0, mm->sp_group); /* match with get_task_mm() in sp_group_add_task() */ atomic_dec(&mm->mm_users); - spg_exit_unlock(unlock); + return; } + up_write(&spg->rw_lock); }
struct page *sp_alloc_pages(struct vm_struct *area, gfp_t mask,
From: Tang Yizhou tangyizhou@huawei.com
ascend inclusion category: perf bugzilla: 47462 CVE: NA
-------------------------------------------------
__sp_group_drop_locked() actually doesn't need to be protected by any locks.
Signed-off-by: Tang Yizhou tangyizhou@huawei.com Reviewed-by: Ding Tianhong dingtianhong@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- mm/share_pool.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-)
diff --git a/mm/share_pool.c b/mm/share_pool.c index fd5ad378cd3fa..2eaf0c06faf33 100644 --- a/mm/share_pool.c +++ b/mm/share_pool.c @@ -503,7 +503,7 @@ static void sp_munmap_task_areas(struct mm_struct *mm, struct list_head *stop) }
/* The caller must hold sp_mutex. */ -static void __sp_group_drop_locked(struct sp_group *spg) +static void sp_group_drop(struct sp_group *spg) { if (atomic_dec_and_test(&spg->use_count)) free_sp_group(spg); @@ -732,7 +732,7 @@ int sp_group_add_task(int pid, int spg_id)
out_drop_group: if (unlikely(ret)) - __sp_group_drop_locked(spg); + sp_group_drop(spg); out_put_mm: /* No need to put the mm if the sp group adds this mm successfully */ if (unlikely(ret)) @@ -781,7 +781,7 @@ void sp_group_post_exit(struct mm_struct *mm) idr_remove(&sp_stat_idr, mm->sp_stat_id); up_write(&sp_stat_sem);
- __sp_group_drop_locked(spg); + sp_group_drop(spg);
kfree(stat); } @@ -2922,6 +2922,7 @@ void sp_group_exit(struct mm_struct *mm) */ down_write(&spg->rw_lock); if (spg_valid(spg) && atomic_read(&mm->mm_users) == MM_WOULD_FREE) { + /* a dead group should NOT be reactive again */ if (list_is_singular(&spg->procs)) is_alive = spg->is_alive = false; list_del(&mm->sp_node); /* affect spg->procs */
From: Tang Yizhou tangyizhou@huawei.com
ascend inclusion
category: doc
bugzilla: 47462
CVE: NA
-------------------------------------------------
Update outdated comments.
Signed-off-by: Tang Yizhou tangyizhou@huawei.com
Reviewed-by: Ding Tianhong dingtianhong@huawei.com
Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com
Signed-off-by: Yang Yingliang yangyingliang@huawei.com
---
 mm/share_pool.c | 17 ++++++++---------
 1 file changed, 8 insertions(+), 9 deletions(-)
diff --git a/mm/share_pool.c b/mm/share_pool.c index 2eaf0c06faf33..1bf6df6e42417 100644 --- a/mm/share_pool.c +++ b/mm/share_pool.c @@ -322,7 +322,6 @@ static void free_sp_group(struct sp_group *spg) kfree(spg); }
-/* The caller must hold sp_mutex. */ static struct sp_group *__sp_find_spg(int pid, int spg_id) { struct sp_group *spg; @@ -382,7 +381,6 @@ int sp_group_id_by_pid(int pid) } EXPORT_SYMBOL_GPL(sp_group_id_by_pid);
-/* The caller must hold sp_mutex. */ static struct sp_group *find_or_alloc_sp_group(int spg_id) { struct sp_group *spg; @@ -470,7 +468,7 @@ static struct sp_group *find_or_alloc_sp_group(int spg_id)
static void __sp_area_drop_locked(struct sp_area *spa);
-/* The caller must hold sp_mutex. */ +/* The caller must down_write(&mm->mmap_sem) */ static void sp_munmap_task_areas(struct mm_struct *mm, struct list_head *stop) { struct sp_area *spa, *prev = NULL; @@ -502,7 +500,6 @@ static void sp_munmap_task_areas(struct mm_struct *mm, struct list_head *stop) spin_unlock(&sp_area_lock); }
-/* The caller must hold sp_mutex. */ static void sp_group_drop(struct sp_group *spg) { if (atomic_dec_and_test(&spg->use_count)) @@ -818,8 +815,6 @@ static unsigned long cached_vstart; /* affected by SP_DVPP and sp_config_dvpp_r * Allocate a region of VA from the share pool. * @size - the size of VA to allocate * - * The caller must hold must sp_mutex when input parameter spg is not NULL - * * Return NULL if fail. */ static struct sp_area *sp_alloc_area(unsigned long size, unsigned long flags, @@ -1135,7 +1130,12 @@ static void sp_try_to_compact(void) sp_add_work_compact(); }
-/* The caller must hold sp_mutex. */ +/* + * The function calls of do_munmap() won't change any non-atomic member + * of struct sp_group. Please review the following chain: + * do_munmap -> remove_vma_list -> remove_vma -> sp_area_drop -> + * __sp_area_drop_locked -> sp_free_area + */ static void sp_munmap(struct mm_struct *mm, unsigned long addr, unsigned long size) { @@ -1152,7 +1152,6 @@ static void sp_munmap(struct mm_struct *mm, unsigned long addr, up_write(&mm->mmap_sem); }
-/* The caller must hold sp_mutex. */ static void __sp_free(struct sp_group *spg, unsigned long addr, unsigned long size, struct mm_struct *stop) { @@ -2657,7 +2656,7 @@ void spa_overview_show(struct seq_file *seq) } }
-/* the caller must hold sp_mutex */ +/* the caller must hold sp_group_sem */ static int idr_spg_stat_cb(int id, void *p, void *data) { struct sp_group *spg = p;
From: Tang Yizhou tangyizhou@huawei.com
ascend inclusion
category: perf
bugzilla: 47462
CVE: NA
-------------------------------------------------
After __sp_find_spg() returns a pointer to an sp_group spg, the memory behind spg may be released at any time if the group is dead and free_sp_group() is called.
To solve this problem, take a reference on spg (increase its refcount) inside __sp_find_spg(). Callers must call sp_group_drop() once they have finished using the group.
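A minimal sketch of the resulting caller contract (the function name spg_caller_sketch is hypothetical; __sp_find_spg(), spg_valid() and sp_group_drop() are the helpers touched by the hunks below):

static int spg_caller_sketch(int pid)
{
	struct sp_group *spg;

	/* on success, __sp_find_spg() has taken a reference via atomic_inc_not_zero() */
	spg = __sp_find_spg(pid, SPG_ID_DEFAULT);
	if (!spg)
		return -ENODEV;

	down_read(&spg->rw_lock);
	if (spg_valid(spg)) {
		/* spg members can be used safely here */
	}
	up_read(&spg->rw_lock);

	/* release the reference; the last drop frees the group */
	sp_group_drop(spg);
	return 0;
}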
Signed-off-by: Tang Yizhou tangyizhou@huawei.com
Reviewed-by: Ding Tianhong dingtianhong@huawei.com
Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com
Signed-off-by: Yang Yingliang yangyingliang@huawei.com
---
 mm/share_pool.c | 131 +++++++++++++++++++++++++++++-------------------
 1 file changed, 80 insertions(+), 51 deletions(-)
diff --git a/mm/share_pool.c b/mm/share_pool.c index 1bf6df6e42417..7d50b55b80cae 100644 --- a/mm/share_pool.c +++ b/mm/share_pool.c @@ -196,21 +196,6 @@ static bool host_svm_sp_enable = false;
int sysctl_share_pool_hugepage_enable = 1;
-static void free_sp_group(struct sp_group *spg); - -/* the caller make sure spg is not NULL */ -static bool sp_group_get(struct sp_group *spg) -{ - down_read(&spg->rw_lock); - if (spg_valid(spg) && atomic_inc_not_zero(&spg->use_count)) { - up_read(&spg->rw_lock); - return true; - } - up_read(&spg->rw_lock); - - return false; -} - static unsigned long spa_size(struct sp_area *spa) { return spa->real_size; @@ -322,7 +307,8 @@ static void free_sp_group(struct sp_group *spg) kfree(spg); }
-static struct sp_group *__sp_find_spg(int pid, int spg_id) +/* user must call sp_group_drop() after use */ +static struct sp_group *__sp_find_spg_locked(int pid, int spg_id) { struct sp_group *spg; int ret = 0; @@ -347,20 +333,41 @@ static struct sp_group *__sp_find_spg(int pid, int spg_id) task_lock(tsk); if (tsk->mm == NULL) spg = NULL; - else + else { spg = tsk->mm->sp_group; + /* don't revive a dead group */ + if (!spg || !atomic_inc_not_zero(&spg->use_count)) + spg = NULL; + } task_unlock(tsk);
put_task_struct(tsk); } else { - down_read(&sp_group_sem); spg = idr_find(&sp_group_idr, spg_id); - up_read(&sp_group_sem); + /* don't revive a dead group */ + if (!spg || !atomic_inc_not_zero(&spg->use_count)) + spg = NULL; }
return spg; }
+static struct sp_group *__sp_find_spg(int pid, int spg_id) +{ + struct sp_group *spg; + + down_read(&sp_group_sem); + spg = __sp_find_spg_locked(pid, spg_id); + up_read(&sp_group_sem); + return spg; +} + +static void sp_group_drop(struct sp_group *spg) +{ + if (atomic_dec_and_test(&spg->use_count)) + free_sp_group(spg); +} + int sp_group_id_by_pid(int pid) { struct sp_group *spg; @@ -377,6 +384,7 @@ int sp_group_id_by_pid(int pid) spg_id = spg->id; up_read(&spg->rw_lock);
+ sp_group_drop(spg); return spg_id; } EXPORT_SYMBOL_GPL(sp_group_id_by_pid); @@ -387,9 +395,8 @@ static struct sp_group *find_or_alloc_sp_group(int spg_id) int ret; char name[20];
- down_read(&sp_group_sem); - spg = idr_find(&sp_group_idr, spg_id); - up_read(&sp_group_sem); + down_write(&sp_group_sem); + spg = __sp_find_spg_locked(current->pid, spg_id);
if (!spg) { struct user_struct *user = NULL; @@ -401,6 +408,15 @@ static struct sp_group *find_or_alloc_sp_group(int spg_id) pr_err("share pool: alloc spg failed due to lack of memory\n"); return ERR_PTR(-ENOMEM); } + ret = idr_alloc(&sp_group_idr, spg, spg_id, spg_id + 1, + GFP_KERNEL); + up_write(&sp_group_sem); + if (ret < 0) { + if (printk_ratelimit()) + pr_err("share pool: create group idr alloc failed\n"); + goto out_kfree; + } + spg->id = spg_id; atomic_set(&spg->spa_num, 0); atomic64_set(&spg->size, 0); @@ -417,16 +433,6 @@ static struct sp_group *find_or_alloc_sp_group(int spg_id)
init_rwsem(&spg->rw_lock);
- down_write(&sp_group_sem); - ret = idr_alloc(&sp_group_idr, spg, spg_id, spg_id + 1, - GFP_KERNEL); - up_write(&sp_group_sem); - if (ret < 0) { - if (printk_ratelimit()) - pr_err("share pool: create group idr alloc failed\n"); - goto out_kfree; - } - sprintf(name, "sp_group_%d", spg_id); spg->file = shmem_kernel_file_setup(name, MAX_LFS_FILESIZE, VM_NORESERVE); @@ -449,8 +455,15 @@ static struct sp_group *find_or_alloc_sp_group(int spg_id) goto out_fput; } } else { - if (!sp_group_get(spg)) + up_write(&sp_group_sem); + down_read(&spg->rw_lock); + if (!spg_valid(spg)) { + up_read(&spg->rw_lock); + sp_group_drop(spg); return ERR_PTR(-ENODEV); + } + up_read(&spg->rw_lock); + /* spg->use_count has increased due to __sp_find_spg() */ }
return spg; @@ -500,12 +513,6 @@ static void sp_munmap_task_areas(struct mm_struct *mm, struct list_head *stop) spin_unlock(&sp_area_lock); }
-static void sp_group_drop(struct sp_group *spg) -{ - if (atomic_dec_and_test(&spg->use_count)) - free_sp_group(spg); -} - /** * sp_group_add_task - add a process to an sp_group * @pid: the pid of the task to be added @@ -541,9 +548,7 @@ int sp_group_add_task(int pid, int spg_id) }
if (spg_id >= SPG_ID_AUTO_MIN && spg_id <= SPG_ID_AUTO_MAX) { - down_read(&sp_group_sem); - spg = idr_find(&sp_group_idr, spg_id); - up_read(&sp_group_sem); + spg = __sp_find_spg(pid, spg_id);
if (!spg) { if (printk_ratelimit()) @@ -557,9 +562,12 @@ int sp_group_add_task(int pid, int spg_id) if (printk_ratelimit()) pr_err("share pool: task add group failed because group id %d " "is dead\n", spg_id); + sp_group_drop(spg); return -EINVAL; } up_read(&spg->rw_lock); + + sp_group_drop(spg); }
if (spg_id == SPG_ID_AUTO) { @@ -778,6 +786,7 @@ void sp_group_post_exit(struct mm_struct *mm) idr_remove(&sp_stat_idr, mm->sp_stat_id); up_write(&sp_stat_sem);
+ /* match with sp_group_add_task -> find_or_alloc_sp_group */ sp_group_drop(spg);
kfree(stat); @@ -1286,12 +1295,12 @@ static unsigned long sp_mmap(struct mm_struct *mm, struct file *file, */ void *sp_alloc(unsigned long size, unsigned long sp_flags, int spg_id) { - struct sp_group *spg = NULL; + struct sp_group *spg, *spg_tmp; struct sp_area *spa = NULL; struct sp_proc_stat *stat; unsigned long sp_addr; unsigned long mmap_addr; - void *p = ERR_PTR(-ENODEV); + void *p; /* return value */ struct mm_struct *mm; struct file *file; unsigned long size_aligned; @@ -1339,21 +1348,28 @@ void *sp_alloc(unsigned long size, unsigned long sp_flags, int spg_id) return ERR_PTR(ret); } spg = current->mm->sp_group; + /* + * increase use_count deliberately, due to __sp_find_spg is + * matched with sp_group_drop + */ + atomic_inc(&spg->use_count); } else { /* other scenes */ if (spg_id != SPG_ID_DEFAULT) { - down_read(&sp_group_sem); - /* the caller should be a member of the sp group */ - if (spg != idr_find(&sp_group_idr, spg_id)) { - up_read(&sp_group_sem); - return ERR_PTR(-EINVAL); + spg_tmp = __sp_find_spg(current->pid, spg_id); + if (spg != spg_tmp) { + sp_group_drop(spg); + if (spg_tmp) + sp_group_drop(spg_tmp); + return ERR_PTR(-ENODEV); } - up_read(&sp_group_sem); + sp_group_drop(spg_tmp); } }
down_read(&spg->rw_lock); if (!spg_valid(spg)) { up_read(&spg->rw_lock); + sp_group_drop(spg); pr_err("share pool: sp alloc failed, spg is dead\n"); return ERR_PTR(-ENODEV); } @@ -1481,6 +1497,8 @@ void *sp_alloc(unsigned long size, unsigned long sp_flags, int spg_id) if (spa && !IS_ERR(spa)) __sp_area_drop(spa);
+ sp_group_drop(spg); + sp_dump_stack(); sp_try_to_compact(); return p; @@ -1797,6 +1815,7 @@ void *sp_make_share_k2u(unsigned long kva, unsigned long size, if (printk_ratelimit()) pr_err("share pool: k2spg invalid spg id %d\n", spg_id); uva = ERR_PTR(-EINVAL); + sp_group_drop(spg); goto out; }
@@ -1812,12 +1831,14 @@ void *sp_make_share_k2u(unsigned long kva, unsigned long size, "(potential no enough virtual memory when -75): %ld\n", PTR_ERR(spa)); uva = spa; + sp_group_drop(spg); goto out; }
if (!vmalloc_area_set_flag(spa, kva_aligned, VM_SHAREPOOL)) { up_read(&spg->rw_lock); pr_err("share pool: %s: the kva %pK is not valid\n", __func__, (void *)kva_aligned); + sp_group_drop(spg); goto out_drop_spa; }
@@ -1832,6 +1853,7 @@ void *sp_make_share_k2u(unsigned long kva, unsigned long size, uva = ERR_PTR(-ENODEV); } up_read(&spg->rw_lock); + sp_group_drop(spg);
accounting: if (!IS_ERR(uva)) { @@ -2472,6 +2494,7 @@ bool sp_config_dvpp_range(size_t start, size_t size, int device_id, int pid)
host_svm_sp_enable = true;
+ sp_group_drop(spg); return true; } EXPORT_SYMBOL_GPL(sp_config_dvpp_range); @@ -2533,8 +2556,10 @@ int proc_sp_group_state(struct seq_file *m, struct pid_namespace *ns,
/* eliminate potential ABBA deadlock */ stat = sp_get_proc_stat(task->mm->sp_stat_id); - if (!stat) + if (unlikely(!stat)) { + sp_group_drop(spg); return 0; + }
/* print the file header */ seq_printf(m, "%-8s %-9s %-13s\n", @@ -2543,10 +2568,13 @@ int proc_sp_group_state(struct seq_file *m, struct pid_namespace *ns, spg_id, byte2kb(atomic64_read(&stat->alloc_size)), hugepage_failures); + + sp_group_drop(spg); return 0; } up_read(&spg->rw_lock);
+ sp_group_drop(spg); return 0; }
@@ -2755,6 +2783,7 @@ static int idr_proc_stat_cb(int id, void *p, void *data) sp_res = byte2kb(atomic64_read(&spg->alloc_size)); } up_read(&spg->rw_lock); + sp_group_drop(spg);
anon = get_mm_counter(mm, MM_ANONPAGES); file = get_mm_counter(mm, MM_FILEPAGES);
From: Tang Yizhou tangyizhou@huawei.com
ascend inclusion
category: perf
bugzilla: 47462
CVE: NA
-------------------------------------------------
After sp_get_proc_stat() returns a pointer to a struct sp_proc_stat, the memory behind stat may be released if the target process exits.
To solve this problem, take a reference on stat (increase its refcount) inside sp_get_proc_stat(). Callers must call sp_proc_stat_drop() once they have finished using it.
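A minimal sketch of the pairing expected after this patch (stat_caller_sketch is a hypothetical caller; sp_get_proc_stat() and sp_proc_stat_drop() are the helpers changed in the hunks below):

static void stat_caller_sketch(struct mm_struct *mm, unsigned long bytes)
{
	struct sp_proc_stat *stat;

	/* on success, sp_get_proc_stat() has bumped stat->use_count */
	stat = sp_get_proc_stat(mm->sp_stat_id);
	if (!stat)
		return;

	atomic64_add(bytes, &stat->alloc_size);	/* stat cannot be freed here */

	/* release the reference; the stat is freed on the final drop */
	sp_proc_stat_drop(stat);
}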
Signed-off-by: Tang Yizhou tangyizhou@huawei.com
Reviewed-by: Ding Tianhong dingtianhong@huawei.com
Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com
Signed-off-by: Yang Yingliang yangyingliang@huawei.com
---
 include/linux/share_pool.h | 7 +++
 mm/oom_kill.c | 4 +-
 mm/share_pool.c | 120 +++++++++++++++++++++++++------------
 3 files changed, 91 insertions(+), 40 deletions(-)
diff --git a/include/linux/share_pool.h b/include/linux/share_pool.h index 356781bfe3e0a..d94d48f57798c 100644 --- a/include/linux/share_pool.h +++ b/include/linux/share_pool.h @@ -109,6 +109,8 @@ struct sp_walk_data {
/* per process memory usage statistics indexed by tgid */ struct sp_proc_stat { + atomic_t use_count; + int tgid; struct mm_struct *mm; char comm[TASK_COMM_LEN]; /* @@ -170,6 +172,7 @@ extern int sp_unregister_notifier(struct notifier_block *nb); extern bool sp_config_dvpp_range(size_t start, size_t size, int device_id, int pid); extern bool is_sharepool_addr(unsigned long addr); extern struct sp_proc_stat *sp_get_proc_stat(int tgid); +extern void sp_proc_stat_drop(struct sp_proc_stat *stat); extern void spa_overview_show(struct seq_file *seq); extern void spg_overview_show(struct seq_file *seq); extern void proc_sharepool_init(void); @@ -373,6 +376,10 @@ static inline struct sp_proc_stat *sp_get_proc_stat(int tgid) return NULL; }
+static inline void sp_proc_stat_drop(struct sp_proc_stat *stat) +{ +} + static inline void spa_overview_show(struct seq_file *seq) { } diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 86db5d5508234..b10a38f58a55c 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -433,10 +433,12 @@ static void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask) task->tgid, task->mm->total_vm, get_mm_rss(task->mm)); if (!stat) pr_cont("%-9c %-9c ", '-', '-'); - else + else { pr_cont("%-9ld %-9ld ", /* byte to KB */ atomic64_read(&stat->alloc_size) >> 10, atomic64_read(&stat->k2u_size) >> 10); + sp_proc_stat_drop(stat); + } pr_cont("%8ld %8lu %5hd %s\n", mm_pgtables_bytes(task->mm), get_mm_counter(task->mm, MM_SWAPENTS), diff --git a/mm/share_pool.c b/mm/share_pool.c index 7d50b55b80cae..6ba479887f0da 100644 --- a/mm/share_pool.c +++ b/mm/share_pool.c @@ -87,47 +87,72 @@ static DEFINE_IDA(sp_group_id_ida);
/* idr of all sp_proc_stats */ static DEFINE_IDR(sp_stat_idr); -/* rw semaphore for sp_stat_idr */ +/* rw semaphore for sp_stat_idr and mm->sp_stat_id */ static DECLARE_RWSEM(sp_stat_sem);
/* for kthread buff_module_guard_work */ static struct sp_proc_stat kthread_stat = {0};
+/* The caller must hold sp_stat_sem */ +static struct sp_proc_stat *sp_get_proc_stat_locked(int tgid) +{ + struct sp_proc_stat *stat; + + stat = idr_find(&sp_stat_idr, tgid); + if (stat) + atomic_inc(&stat->use_count); + + /* maybe NULL or not, we always return it */ + return stat; +} + /* * The caller must ensure no concurrency problem * for task_struct and mm_struct. + * + * The user must call sp_proc_stat_drop() after use. */ static struct sp_proc_stat *sp_init_proc_stat(struct task_struct *tsk, struct mm_struct *mm) { struct sp_proc_stat *stat; - int id = mm->sp_stat_id; - int tgid = tsk->tgid; + int id, tgid = tsk->tgid; int ret;
+ down_write(&sp_stat_sem); + id = mm->sp_stat_id; if (id) { - stat = sp_get_proc_stat(id); /* other threads in the same process may have initialized it */ - if (stat) + stat = sp_get_proc_stat_locked(tgid); + if (stat) { + up_write(&sp_stat_sem); return stat; + } else { + /* if enter this branch, that's our mistake */ + pr_err("share pool: sp_init_proc_stat invalid id %d\n", id); + return ERR_PTR(-EBUSY); + } }
stat = kzalloc(sizeof(*stat), GFP_KERNEL); if (stat == NULL) { + up_write(&sp_stat_sem); if (printk_ratelimit()) pr_err("share pool: alloc proc stat failed due to lack of memory\n"); return ERR_PTR(-ENOMEM); }
+ /* use_count = 2: match with sp_proc_stat_drop */ + atomic_set(&stat->use_count, 2); atomic64_set(&stat->alloc_size, 0); atomic64_set(&stat->k2u_size, 0); + stat->tgid = tgid; stat->mm = mm; get_task_comm(stat->comm, tsk);
- down_write(&sp_stat_sem); ret = idr_alloc(&sp_stat_idr, stat, tgid, tgid + 1, GFP_KERNEL); - up_write(&sp_stat_sem); if (ret < 0) { + up_write(&sp_stat_sem); if (printk_ratelimit()) pr_err("share pool: proc stat idr alloc failed %d\n", ret); kfree(stat); @@ -135,6 +160,7 @@ static struct sp_proc_stat *sp_init_proc_stat(struct task_struct *tsk, }
mm->sp_stat_id = ret; + up_write(&sp_stat_sem); return stat; }
@@ -727,13 +753,10 @@ int sp_group_add_task(int pid, int spg_id) } up_write(&spg->rw_lock);
- if (unlikely(ret)) { - down_write(&sp_stat_sem); - idr_remove(&sp_stat_idr, mm->sp_stat_id); - up_write(&sp_stat_sem); - kfree(stat); - mm->sp_stat_id = 0; - } + /* double drop when fail: ensure release stat */ + if (unlikely(ret)) + sp_proc_stat_drop(stat); + sp_proc_stat_drop(stat); /* match with sp_init_proc_stat */
out_drop_group: if (unlikely(ret)) @@ -780,16 +803,15 @@ void sp_group_post_exit(struct mm_struct *mm) "It applied %ld aligned KB, k2u shared %ld aligned KB\n", stat->comm, mm->sp_stat_id, mm->sp_group->id, byte2kb(alloc_size), byte2kb(k2u_size)); - }
- down_write(&sp_stat_sem); - idr_remove(&sp_stat_idr, mm->sp_stat_id); - up_write(&sp_stat_sem); + /* match with sp_get_proc_stat in THIS function */ + sp_proc_stat_drop(stat); + /* match with sp_init_proc_stat, we expect stat is released after this call */ + sp_proc_stat_drop(stat); + }
/* match with sp_group_add_task -> find_or_alloc_sp_group */ sp_group_drop(spg); - - kfree(stat); }
/* the caller must hold sp_area_lock */ @@ -1240,9 +1262,10 @@ int sp_free(unsigned long addr) atomic64_sub(spa->real_size, &kthread_stat.alloc_size); } else { stat = sp_get_proc_stat(current->mm->sp_stat_id); - if (stat) + if (stat) { atomic64_sub(spa->real_size, &stat->alloc_size); - else + sp_proc_stat_drop(stat); + } else BUG(); }
@@ -1489,8 +1512,10 @@ void *sp_alloc(unsigned long size, unsigned long sp_flags, int spg_id)
if (!IS_ERR(p)) { stat = sp_get_proc_stat(current->mm->sp_stat_id); - if (stat) + if (stat) { atomic64_add(size_aligned, &stat->alloc_size); + sp_proc_stat_drop(stat); + } }
/* this will free spa if mmap failed */ @@ -1769,13 +1794,12 @@ void *sp_make_share_k2u(unsigned long kva, unsigned long size, /* * Process statistics initialization. if the target process has been * added to a sp group, then stat will be returned immediately. - * I believe there is no need to free stat in error handling branches. */ stat = sp_init_proc_stat(tsk, mm); if (IS_ERR(stat)) { uva = stat; pr_err("share pool: init proc stat failed, ret %lx\n", PTR_ERR(stat)); - goto out; + goto out_put_mm; }
spg = __sp_find_spg(pid, SPG_ID_DEFAULT); @@ -1785,7 +1809,7 @@ void *sp_make_share_k2u(unsigned long kva, unsigned long size, if (printk_ratelimit()) pr_err("share pool: k2task invalid spg id %d\n", spg_id); uva = ERR_PTR(-EINVAL); - goto out; + goto out_drop_proc_stat; } spa = sp_alloc_area(size_aligned, sp_flags, NULL, SPA_TYPE_K2TASK); if (IS_ERR(spa)) { @@ -1794,7 +1818,7 @@ void *sp_make_share_k2u(unsigned long kva, unsigned long size, "(potential no enough virtual memory when -75): %ld\n", PTR_ERR(spa)); uva = spa; - goto out; + goto out_drop_proc_stat; }
if (!vmalloc_area_set_flag(spa, kva_aligned, VM_SHAREPOOL)) { @@ -1815,8 +1839,7 @@ void *sp_make_share_k2u(unsigned long kva, unsigned long size, if (printk_ratelimit()) pr_err("share pool: k2spg invalid spg id %d\n", spg_id); uva = ERR_PTR(-EINVAL); - sp_group_drop(spg); - goto out; + goto out_drop_spg; }
if (enable_share_k2u_spg) @@ -1831,14 +1854,12 @@ void *sp_make_share_k2u(unsigned long kva, unsigned long size, "(potential no enough virtual memory when -75): %ld\n", PTR_ERR(spa)); uva = spa; - sp_group_drop(spg); - goto out; + goto out_drop_spg; }
if (!vmalloc_area_set_flag(spa, kva_aligned, VM_SHAREPOOL)) { up_read(&spg->rw_lock); pr_err("share pool: %s: the kva %pK is not valid\n", __func__, (void *)kva_aligned); - sp_group_drop(spg); goto out_drop_spa; }
@@ -1853,7 +1874,6 @@ void *sp_make_share_k2u(unsigned long kva, unsigned long size, uva = ERR_PTR(-ENODEV); } up_read(&spg->rw_lock); - sp_group_drop(spg);
accounting: if (!IS_ERR(uva)) { @@ -1868,7 +1888,12 @@ void *sp_make_share_k2u(unsigned long kva, unsigned long size,
out_drop_spa: __sp_area_drop(spa); -out: +out_drop_spg: + if (spg) + sp_group_drop(spg); +out_drop_proc_stat: + sp_proc_stat_drop(stat); +out_put_mm: mmput(mm); out_put_task: put_task_struct(tsk); @@ -2298,9 +2323,10 @@ static int sp_unshare_uva(unsigned long uva, unsigned long size, int pid, int sp atomic64_sub(spa->real_size, &kthread_stat.k2u_size); } else { stat = sp_get_proc_stat(current->mm->sp_stat_id); - if (stat) + if (stat) { atomic64_sub(spa->real_size, &stat->k2u_size); - else + sp_proc_stat_drop(stat); + } else WARN(1, "share pool: %s: null process stat\n", __func__); }
@@ -2525,18 +2551,33 @@ __setup("enable_sp_share_k2u_spg", enable_share_k2u_to_group);
/*** Statistical and maintenance functions ***/
+/* user must call sp_proc_stat_drop() after use */ struct sp_proc_stat *sp_get_proc_stat(int tgid) { struct sp_proc_stat *stat;
down_read(&sp_stat_sem); - stat = idr_find(&sp_stat_idr, tgid); + stat = sp_get_proc_stat_locked(tgid); up_read(&sp_stat_sem); - - /* maybe NULL or not, we always return it */ return stat; }
+static void free_sp_proc_stat(struct sp_proc_stat *stat) +{ + stat->mm->sp_stat_id = 0; + down_write(&sp_stat_sem); + idr_remove(&sp_stat_idr, stat->tgid); + up_write(&sp_stat_sem); + kfree(stat); +} + +/* the caller make sure stat is not NULL */ +void sp_proc_stat_drop(struct sp_proc_stat *stat) +{ + if (atomic_dec_and_test(&stat->use_count)) + free_sp_proc_stat(stat); +} + int proc_sp_group_state(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *task) { @@ -2569,6 +2610,7 @@ int proc_sp_group_state(struct seq_file *m, struct pid_namespace *ns, byte2kb(atomic64_read(&stat->alloc_size)), hugepage_failures);
+ sp_proc_stat_drop(stat); sp_group_drop(spg); return 0; }
From: Bixuan Cui cuibixuan@huawei.com
ascend inclusion
category: feature
bugzilla: NA
CVE: NA
-------------------------------------------------
The share pool feature belongs to the memory subsystem, so its sysctl knobs are better placed in vm_table than in kern_table.
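As a side note, the knobs become visible under /proc/sys/vm/ after this change (the paths are not spelled out in the patch; they follow the usual mapping of kern_table to /proc/sys/kernel and vm_table to /proc/sys/vm). A small userspace sketch of reading one of them:

#include <stdio.h>

int main(void)
{
	char buf[16];
	/* previously exposed as /proc/sys/kernel/share_pool_hugepage_enable */
	FILE *f = fopen("/proc/sys/vm/share_pool_hugepage_enable", "r");

	if (!f)
		return 1;
	if (fgets(buf, sizeof(buf), f))
		printf("share_pool_hugepage_enable = %s", buf);
	fclose(f);
	return 0;
}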
Signed-off-by: Bixuan Cui cuibixuan@huawei.com
Reviewed-by: Tang Yizhou tangyizhou@huawei.com
Signed-off-by: Yang Yingliang yangyingliang@huawei.com
---
 kernel/sysctl.c | 42 ++++++++++++++++++++----------------------
 1 file changed, 20 insertions(+), 22 deletions(-)
diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 4d8d7c15a6ebc..90f71ecdec636 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1254,28 +1254,6 @@ static struct ctl_table kern_table[] = { .extra1 = &zero, .extra2 = &one, }, -#ifdef CONFIG_ASCEND_SHARE_POOL - { - /* 0: disable, 1: enable */ - .procname = "share_pool_hugepage_enable", - .data = &sysctl_share_pool_hugepage_enable, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, - .extra2 = &one, - }, - { - /* 0: map_unlock, 1: map_lock */ - .procname = "share_pool_map_lock_enable", - .data = &sysctl_share_pool_map_lock_enable, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, - .extra2 = &one, - }, -#endif { } };
@@ -1767,6 +1745,26 @@ static struct ctl_table vm_table[] = { .extra1 = &zero, .extra2 = &one, }, + { + /* 0: disable, 1: enable */ + .procname = "share_pool_hugepage_enable", + .data = &sysctl_share_pool_hugepage_enable, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &one, + }, + { + /* 0: map_unlock, 1: map_lock */ + .procname = "share_pool_map_lock_enable", + .data = &sysctl_share_pool_map_lock_enable, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &one, + }, #endif { } };
From: Ding Tianhong dingtianhong@huawei.com
ascend inclusion
category: feature
bugzilla: NA
CVE: NA
-------------------------------------------------
When the share pool is turned off, the is_vmalloc_huge() and is_vmalloc_sharepool() stubs declare their argument with a different type than the enabled versions, which triggers warnings like this:
/include/linux/share_pool.h:462:20: note: expected ‘struct vm_struct *’ but argument is of type ‘long unsigned int’
Fix this warning.
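For illustration (the real change is the small hunk below, which only fixes the parameter type and leaves the function bodies untouched): callers pass the VM flags word, e.g. is_vmalloc_sharepool(area->flags), so the disabled stub must take an unsigned long just like the enabled variant. A conventional form of such a stub:

static inline bool is_vmalloc_sharepool(unsigned long vm_flags)
{
	return false;	/* share pool compiled out */
}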
Fixes: ad4504322d9e ("ascend: sharepool: don't enable the vmalloc to use hugepage default")
Signed-off-by: Ding Tianhong dingtianhong@huawei.com
Reviewed-by: Tang Yizhou tangyizhou@huawei.com
Signed-off-by: Yang Yingliang yangyingliang@huawei.com
---
 include/linux/share_pool.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/include/linux/share_pool.h b/include/linux/share_pool.h index d94d48f57798c..859efd3525f35 100644 --- a/include/linux/share_pool.h +++ b/include/linux/share_pool.h @@ -466,12 +466,12 @@ static inline void *buff_vzalloc_hugepage_user(unsigned long size) return NULL; }
-static inline bool is_vmalloc_huge(struct vm_struct *vm) +static inline bool is_vmalloc_huge(unsigned long vm_flags) { return NULL; }
-static inline bool is_vmalloc_sharepool(struct vm_struct *vm) +static inline bool is_vmalloc_sharepool(unsigned long vm_flags) { return NULL; }