From: Cui GaoSheng <cuigaosheng1@huawei.com>

hulk inclusion
category: bugfix
bugzilla: 186383 https://gitee.com/openeuler/kernel/issues/I4X1AI?from=project-issue
CVE: NA
backport: openEuler-22.03-LTS
--------------------------------
This reverts commit fcfdde9cfc503cfd191b7286f9c48077fb5bf420 in preparation for backporting the mainline fix for the same issue, which follows.
Signed-off-by: Cui GaoSheng <cuigaosheng1@huawei.com>
Reviewed-by: Xiu Jianfeng <xiujianfeng@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 kernel/audit.c | 5 -----
 1 file changed, 5 deletions(-)
diff --git a/kernel/audit.c b/kernel/audit.c
index 21be62bc8205..2a38cbaf3ddb 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -732,8 +732,6 @@ static int kauditd_send_queue(struct sock *sk, u32 portid,
 		if (!sk) {
 			if (err_hook)
 				(*err_hook)(skb);
-			if (queue == &audit_hold_queue)
-				goto out;
 			continue;
 		}
@@ -750,8 +748,6 @@ static int kauditd_send_queue(struct sock *sk, u32 portid,
 				(*err_hook)(skb);
 			if (rc == -EAGAIN)
 				rc = 0;
-			if (queue == &audit_hold_queue)
-				goto out;
 			/* continue to drain the queue */
 			continue;
 		} else
@@ -763,7 +759,6 @@ static int kauditd_send_queue(struct sock *sk, u32 portid,
 		}
 	}

-out:
 	return (rc >= 0 ? 0 : rc);
 }
From: Paul Moore <paul@paul-moore.com>

mainline inclusion
from mainline-v5.17-rc3
commit f26d04331360d42dbd6b58448bd98e4edbfbe1c5
category: bugfix
bugzilla: 186383 https://gitee.com/openeuler/kernel/issues/I4X1AI?from=project-issue
CVE: NA
backport: openEuler-22.03-LTS
--------------------------------
When an admin enables audit at early boot via the "audit=1" kernel command line the audit queue behavior is slightly different; the audit subsystem goes to greater lengths to avoid dropping records, which unfortunately can result in problems when the audit daemon is forcibly stopped for an extended period of time.
This patch makes a number of changes designed to improve the audit queuing behavior so that leaving the audit daemon in a stopped state for an extended period does not cause a significant impact to the system.
- kauditd_send_queue() is now limited to looping through the passed queue only once per call. This not only prevents the function from looping indefinitely when records are returned to the current queue, it also allows any recovery handling in kauditd_thread() to take place when kauditd_send_queue() returns (a stand-alone sketch of this once-per-call drain follows this list).
- Transient netlink send errors seen as -EAGAIN now cause the record to be returned to the retry queue instead of going to the hold queue. The intention of the hold queue is to store, perhaps for an extended period of time, the events which led up to the audit daemon going offline. The retry queue remains a temporary queue intended to protect against transient issues between the kernel and the audit daemon.
- The retry queue is now limited by the audit_backlog_limit setting, the same as the other queues. This allows admins to bound the size of all of the audit queues on the system.
- kauditd_rehold_skb() now returns records to the end of the hold queue to ensure ordering is preserved in the face of recent changes to kauditd_send_queue().
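To illustrate the once-per-call drain from the first item, here is a small stand-alone user-space sketch. It is only a model (toy queue, invented record type, and a stand-in send_rec() that fails for odd ids), not the kernel code: snapshotting the tail before draining bounds the loop even when failed records are put back on the same queue, so the caller regains control for its recovery handling.

#include <stdio.h>
#include <stdlib.h>

struct rec { int id; struct rec *next; };
struct queue { struct rec *head, *tail; };

static void enqueue(struct queue *q, struct rec *r)
{
	r->next = NULL;
	if (q->tail)
		q->tail->next = r;
	else
		q->head = r;
	q->tail = r;
}

static struct rec *dequeue(struct queue *q)
{
	struct rec *r = q->head;

	if (r && !(q->head = r->next))
		q->tail = NULL;
	return r;
}

/* Pretend every odd-numbered record hits a transient send failure. */
static int send_rec(struct rec *r) { return (r->id % 2) == 0; }

/*
 * Drain at most the records that were queued when we were called.  Failed
 * records go back on the tail, but the snapshot of the old tail guarantees
 * this call never loops over them again.
 */
static void drain_once(struct queue *q)
{
	struct rec *tail = q->tail;	/* like skb_peek_tail() */
	struct rec *r;

	while ((r = dequeue(q))) {
		int was_tail = (r == tail);

		if (send_rec(r)) {
			printf("sent %d\n", r->id);
			free(r);
		} else {
			printf("requeued %d\n", r->id);
			enqueue(q, r);
		}
		if (was_tail)
			break;
	}
}

int main(void)
{
	struct queue q = { 0 };

	for (int i = 0; i < 6; i++) {
		struct rec *r = malloc(sizeof(*r));
		r->id = i;
		enqueue(&q, r);
	}
	drain_once(&q);	/* terminates even though odd records are requeued */
	return 0;
}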
Cc: stable@vger.kernel.org
Fixes: 5b52330bbfe63 ("audit: fix auditd/kernel connection state tracking")
Fixes: f4b3ee3c85551 ("audit: improve robustness of the audit queue handling")
Reported-by: Gaosheng Cui <cuigaosheng1@huawei.com>
Tested-by: Gaosheng Cui <cuigaosheng1@huawei.com>
Reviewed-by: Richard Guy Briggs <rgb@redhat.com>
Signed-off-by: Paul Moore <paul@paul-moore.com>
Signed-off-by: Cui GaoSheng <cuigaosheng1@huawei.com>
Reviewed-by: Xiu Jianfeng <xiujianfeng@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 kernel/audit.c | 62 ++++++++++++++++++++++++++++++++++----------------
 1 file changed, 43 insertions(+), 19 deletions(-)
diff --git a/kernel/audit.c b/kernel/audit.c
index 2a38cbaf3ddb..aeec86ed4708 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -541,20 +541,22 @@ static void kauditd_printk_skb(struct sk_buff *skb)
 /**
  * kauditd_rehold_skb - Handle a audit record send failure in the hold queue
  * @skb: audit record
+ * @error: error code (unused)
  *
  * Description:
  * This should only be used by the kauditd_thread when it fails to flush the
  * hold queue.
  */
-static void kauditd_rehold_skb(struct sk_buff *skb)
+static void kauditd_rehold_skb(struct sk_buff *skb, __always_unused int error)
 {
-	/* put the record back in the queue at the same place */
-	skb_queue_head(&audit_hold_queue, skb);
+	/* put the record back in the queue */
+	skb_queue_tail(&audit_hold_queue, skb);
 }

 /**
  * kauditd_hold_skb - Queue an audit record, waiting for auditd
  * @skb: audit record
+ * @error: error code
  *
  * Description:
  * Queue the audit record, waiting for an instance of auditd. When this
@@ -564,19 +566,31 @@ static void kauditd_rehold_skb(struct sk_buff *skb)
  * and queue it, if we have room. If we want to hold on to the record, but we
  * don't have room, record a record lost message.
  */
-static void kauditd_hold_skb(struct sk_buff *skb)
+static void kauditd_hold_skb(struct sk_buff *skb, int error)
 {
 	/* at this point it is uncertain if we will ever send this to auditd so
 	 * try to send the message via printk before we go any further */
 	kauditd_printk_skb(skb);

 	/* can we just silently drop the message? */
-	if (!audit_default) {
-		kfree_skb(skb);
-		return;
+	if (!audit_default)
+		goto drop;
+
+	/* the hold queue is only for when the daemon goes away completely,
+	 * not -EAGAIN failures; if we are in a -EAGAIN state requeue the
+	 * record on the retry queue unless it's full, in which case drop it
+	 */
+	if (error == -EAGAIN) {
+		if (!audit_backlog_limit ||
+		    skb_queue_len(&audit_retry_queue) < audit_backlog_limit) {
+			skb_queue_tail(&audit_retry_queue, skb);
+			return;
+		}
+		audit_log_lost("kauditd retry queue overflow");
+		goto drop;
 	}

-	/* if we have room, queue the message */
+	/* if we have room in the hold queue, queue the message */
 	if (!audit_backlog_limit ||
 	    skb_queue_len(&audit_hold_queue) < audit_backlog_limit) {
 		skb_queue_tail(&audit_hold_queue, skb);
@@ -585,24 +599,32 @@ static void kauditd_hold_skb(struct sk_buff *skb)

 	/* we have no other options - drop the message */
 	audit_log_lost("kauditd hold queue overflow");
+drop:
 	kfree_skb(skb);
 }

 /**
  * kauditd_retry_skb - Queue an audit record, attempt to send again to auditd
  * @skb: audit record
+ * @error: error code (unused)
  *
  * Description:
  * Not as serious as kauditd_hold_skb() as we still have a connected auditd,
  * but for some reason we are having problems sending it audit records so
  * queue the given record and attempt to resend.
  */
-static void kauditd_retry_skb(struct sk_buff *skb)
+static void kauditd_retry_skb(struct sk_buff *skb, __always_unused int error)
 {
-	/* NOTE: because records should only live in the retry queue for a
-	 * short period of time, before either being sent or moved to the hold
-	 * queue, we don't currently enforce a limit on this queue */
-	skb_queue_tail(&audit_retry_queue, skb);
+	if (!audit_backlog_limit ||
+	    skb_queue_len(&audit_retry_queue) < audit_backlog_limit) {
+		skb_queue_tail(&audit_retry_queue, skb);
+		return;
+	}
+
+	/* we have to drop the record, send it via printk as a last effort */
+	kauditd_printk_skb(skb);
+	audit_log_lost("kauditd retry queue overflow");
+	kfree_skb(skb);
 }

 /**
@@ -640,7 +662,7 @@ static void auditd_reset(const struct auditd_connection *ac)
 	/* flush the retry queue to the hold queue, but don't touch the main
 	 * queue since we need to process that normally for multicast */
 	while ((skb = skb_dequeue(&audit_retry_queue)))
-		kauditd_hold_skb(skb);
+		kauditd_hold_skb(skb, -ECONNREFUSED);
 }

 /**
@@ -714,16 +736,18 @@ static int kauditd_send_queue(struct sock *sk, u32 portid,
 			      struct sk_buff_head *queue,
 			      unsigned int retry_limit,
 			      void (*skb_hook)(struct sk_buff *skb),
-			      void (*err_hook)(struct sk_buff *skb))
+			      void (*err_hook)(struct sk_buff *skb, int error))
 {
 	int rc = 0;
-	struct sk_buff *skb;
+	struct sk_buff *skb = NULL;
+	struct sk_buff *skb_tail;
 	unsigned int failed = 0;

 	/* NOTE: kauditd_thread takes care of all our locking, we just use
 	 * the netlink info passed to us (e.g. sk and portid) */

-	while ((skb = skb_dequeue(queue))) {
+	skb_tail = skb_peek_tail(queue);
+	while ((skb != skb_tail) && (skb = skb_dequeue(queue))) {
 		/* call the skb_hook for each skb we touch */
 		if (skb_hook)
 			(*skb_hook)(skb);
@@ -731,7 +755,7 @@ static int kauditd_send_queue(struct sock *sk, u32 portid,
 		/* can we send to anyone via unicast? */
 		if (!sk) {
 			if (err_hook)
-				(*err_hook)(skb);
+				(*err_hook)(skb, -ECONNREFUSED);
 			continue;
 		}
@@ -745,7 +769,7 @@ static int kauditd_send_queue(struct sock *sk, u32 portid,
 		    rc == -ECONNREFUSED || rc == -EPERM) {
 			sk = NULL;
 			if (err_hook)
-				(*err_hook)(skb);
+				(*err_hook)(skb, rc);
 			if (rc == -EAGAIN)
 				rc = 0;
 			/* continue to drain the queue */
From: Kefeng Wang <wangkefeng.wang@huawei.com>

mainline inclusion
from mainline-v5.16-rc1
commit d0fe47c64152a63ceed4b9f29ac56371407fa7b4
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I57LS2
CVE: NA
backport: openEuler-22.03-LTS
--------------------------------
After commit f227f0faf63b ("slub: fix unreclaimable slab stat for bulk free"), the check for freeing a non-slab page was replaced by VM_BUG_ON_PAGE, which only checks anything when CONFIG_DEBUG_VM is enabled; since that config can hurt performance, it is meant for debugging only.

Commit 0937502af7c9 ("slub: Add check for kfree() of non slab objects.") added this check, and it should be active in all configs to catch invalid frees, which can point at real problems such as memory corruption, use-after-free, and double free. So replace VM_BUG_ON_PAGE with WARN_ON_ONCE and print the object address to help users debug the issue.
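For readers unfamiliar with the distinction, the stand-alone user-space sketch below models the warn-once behaviour (it is not the kernel macro, just an illustration with an invented helper name): the check always runs, reports the first violation together with the offending pointer, and lets execution continue instead of being compiled out like a debug-only assertion.

#include <stdbool.h>
#include <stdio.h>

/*
 * Always-on, warn-once check: report the first violation (with the offending
 * pointer) and keep going.  A loose user-space model of the
 * WARN_ON_ONCE() + pr_warn_once() pattern used by the patch.
 */
static bool warn_on_once_bad_page(void *object, bool page_is_compound)
{
	static bool warned;

	if (page_is_compound)
		return false;		/* nothing wrong */

	if (!warned) {
		warned = true;
		fprintf(stderr, "freeing non-slab, non-compound page\n");
		fprintf(stderr, "object pointer: %p\n", object);
	}
	return true;			/* caller may still continue */
}

int main(void)
{
	int dummy;

	warn_on_once_bad_page(&dummy, false);	/* prints the warning once */
	warn_on_once_bad_page(&dummy, false);	/* silent on repeat */
	return 0;
}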
Link: https://lkml.kernel.org/r/20210930070214.61499-1-wangkefeng.wang@huawei.com
Signed-off-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Shakeel Butt <shakeelb@google.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Christoph Lameter <cl@linux.com>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: David Rienjes <rientjes@google.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Ma Wupeng <mawupeng1@huawei.com>
Reviewed-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 mm/slub.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)
diff --git a/mm/slub.c b/mm/slub.c
index 7a7b0bf82b8e..98452815a066 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -3209,7 +3209,9 @@ static inline void free_nonslab_page(struct page *page, void *object)
 {
 	unsigned int order = compound_order(page);

-	VM_BUG_ON_PAGE(!PageCompound(page), page);
+	if (WARN_ON_ONCE(!PageCompound(page)))
+		pr_warn_once("object pointer: 0x%p\n", object);
+
 	kfree_hook(object);
 	mod_lruvec_page_state(page, NR_SLAB_UNRECLAIMABLE_B, -(PAGE_SIZE << order));
 	__free_pages(page, order);
From: Liu Shixin <liushixin2@huawei.com>

hulk inclusion
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I4XWBS
--------------------------------
This reverts commit d3d0ca13f958d359c80c8719b23cd7367eb881f2.

We found that this patch may lead to a TLB conflict abort, which may result from changing live block<->table mappings. The problem probably has something to do with the Break-Before-Make sequence requirement, but the root cause is not yet clear.
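For background, the architecture requires a Break-Before-Make sequence when one valid translation table entry is replaced by an entry of a different type. The fragment below is only an illustrative, self-contained C model of that ordering (the helpers are stand-ins, not the arm64 primitives); the reverted split_pud_page()/split_pmd_page() helpers instead populated the new table entry first and flushed the TLB afterwards, which is one reason Break-Before-Make is suspected here.

#include <stdint.h>
#include <stddef.h>

typedef uint64_t pte_entry_t;

/* Stand-ins for the real primitives; they only mark the ordering. */
static void write_entry(volatile pte_entry_t *slot, pte_entry_t val)
{
	*slot = val;			/* would be a single 64-bit store + barrier */
}

static void flush_tlb_range_stub(uintptr_t start, uintptr_t end)
{
	(void)start;			/* would be a TLB invalidate + DSB */
	(void)end;
}

/*
 * Break-Before-Make: to replace a live block mapping with a table mapping,
 * first install an invalid entry, then invalidate the TLB for the range,
 * and only then install the new entry.  Writing the new entry directly over
 * the old one lets two conflicting translations coexist in the TLB.
 */
static void replace_block_with_table(volatile pte_entry_t *slot,
				     pte_entry_t new_table_entry,
				     uintptr_t va, size_t size)
{
	write_entry(slot, 0);				/* break */
	flush_tlb_range_stub(va, va + size);		/* invalidate */
	write_entry(slot, new_table_entry);		/* make */
}

int main(void)
{
	volatile pte_entry_t slot = 0x200000000ULL | 0x1;	/* pretend block entry */

	replace_block_with_table(&slot, 0x180000000ULL | 0x3, 0x40000000, 1 << 21);
	return (int)(slot & 0x3);
}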
Signed-off-by: Liu Shixin <liushixin2@huawei.com>
Reviewed-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 arch/arm64/include/asm/kfence.h | 70 +--------------------------------
 arch/arm64/mm/mmu.c | 6 ++-
 2 files changed, 5 insertions(+), 71 deletions(-)
diff --git a/arch/arm64/include/asm/kfence.h b/arch/arm64/include/asm/kfence.h
index 322e95bc228d..d061176d57ea 100644
--- a/arch/arm64/include/asm/kfence.h
+++ b/arch/arm64/include/asm/kfence.h
@@ -8,77 +8,9 @@
 #ifndef __ASM_KFENCE_H
 #define __ASM_KFENCE_H

-#include <linux/kfence.h>
 #include <asm/cacheflush.h>
-#include <asm/pgalloc.h>

-static inline int split_pud_page(pud_t *pud, unsigned long addr)
-{
-	int i;
-	pmd_t *pmd = pmd_alloc_one(&init_mm, addr);
-	unsigned long pfn = PFN_DOWN(__pa(addr));
-
-	if (!pmd)
-		return -ENOMEM;
-
-	for (i = 0; i < PTRS_PER_PMD; i++)
-		set_pmd(pmd + i, pmd_mkhuge(pfn_pmd(pfn + i * PTRS_PER_PTE, PAGE_KERNEL)));
-
-	smp_wmb(); /* See comment in __pte_alloc */
-	pud_populate(&init_mm, pud, pmd);
-
-	flush_tlb_kernel_range(addr, addr + PUD_SIZE);
-	return 0;
-}
-
-static inline int split_pmd_page(pmd_t *pmd, unsigned long addr)
-{
-	int i;
-	pte_t *pte = pte_alloc_one_kernel(&init_mm);
-	unsigned long pfn = PFN_DOWN(__pa(addr));
-
-	if (!pte)
-		return -ENOMEM;
-
-	for (i = 0; i < PTRS_PER_PTE; i++)
-		set_pte(pte + i, pfn_pte(pfn + i, PAGE_KERNEL));
-
-	smp_wmb(); /* See comment in __pte_alloc */
-	pmd_populate_kernel(&init_mm, pmd, pte);
-
-	flush_tlb_kernel_range(addr, addr + PMD_SIZE);
-	return 0;
-}
-
-static inline bool arch_kfence_init_pool(void)
-{
-	unsigned long addr;
-	pgd_t *pgd;
-	p4d_t *p4d;
-	pud_t *pud;
-	pmd_t *pmd;
-
-	for (addr = (unsigned long)__kfence_pool; is_kfence_address((void *)addr);
-	     addr += PAGE_SIZE) {
-		pgd = pgd_offset(&init_mm, addr);
-		if (pgd_leaf(*pgd))
-			return false;
-		p4d = p4d_offset(pgd, addr);
-		if (p4d_leaf(*p4d))
-			return false;
-		pud = pud_offset(p4d, addr);
-		if (pud_leaf(*pud)) {
-			if (split_pud_page(pud, addr & PUD_MASK))
-				return false;
-		}
-		pmd = pmd_offset(pud, addr);
-		if (pmd_leaf(*pmd)) {
-			if (split_pmd_page(pmd, addr & PMD_MASK))
-				return false;
-		}
-	}
-	return true;
-}
+static inline bool arch_kfence_init_pool(void) { return true; }

 static inline bool kfence_protect_page(unsigned long addr, bool protect)
 {
diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
index 3b2937ae5e98..7314a7a3613f 100644
--- a/arch/arm64/mm/mmu.c
+++ b/arch/arm64/mm/mmu.c
@@ -477,7 +477,8 @@ static void __init map_mem(pgd_t *pgdp)
 	int flags = 0, eflags = 0;
 	u64 i;

-	if (rodata_full || debug_pagealloc_enabled())
+	if (rodata_full || debug_pagealloc_enabled() ||
+	    IS_ENABLED(CONFIG_KFENCE))
 		flags = NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS;

 	/*
@@ -1486,7 +1487,8 @@ int arch_add_memory(int nid, u64 start, u64 size,
 	 * KFENCE requires linear map to be mapped at page granularity, so that
 	 * it is possible to protect/unprotect single pages in the KFENCE pool.
 	 */
-	if (rodata_full || debug_pagealloc_enabled())
+	if (rodata_full || debug_pagealloc_enabled() ||
+	    IS_ENABLED(CONFIG_KFENCE))
 		flags = NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS;

 	__create_pgd_mapping(swapper_pg_dir, start, __phys_to_virt(start),
From: Jisheng Zhang <Jisheng.Zhang@synaptics.com>

maillist inclusion
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I4XWBS
Reference: https://lore.kernel.org/all/20210524172433.015b3b6b@xhacker.debian/
--------------------------------
Jisheng Zhang proposed another way of saving memory; we combine his two patches into one and make some adaptations for dynamic kfence objects.
Description of the original patch:
Some architectures may want to allocate the __kfence_pool differently, for example earlier, before paging_init(). We also delay the memset() until kfence_init_pool().

KFENCE requires the linear map to be mapped at page granularity, so that single pages in the KFENCE pool can be protected/unprotected. Currently, if KFENCE is enabled, arm64 maps the entire linear map at page granularity, which is overkill. In fact, only the pages in the KFENCE pool itself need to be mapped at page granularity. We achieve this by allocating the KFENCE pool before paging_init(), so the pool address is known, and then taking care to map the pool at page granularity during map_mem().
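A toy model of that mapping strategy, purely for illustration (the region splitting below is plain user-space C with made-up sizes and addresses, not the arm64 code): everything outside the pool keeps block-sized mappings, and only the pool itself is mapped page by page.

#include <stdio.h>
#include <stdint.h>

#define BLOCK_SIZE	(2UL * 1024 * 1024)	/* 2 MiB block mapping */
#define SMALL_PAGE	(4UL * 1024)		/* 4 KiB page mapping */

static void map_range(uint64_t start, uint64_t end, unsigned long granule,
		      const char *what)
{
	if (start < end)
		printf("map [%#lx, %#lx) with %lu KiB granules (%s)\n",
		       (unsigned long)start, (unsigned long)end,
		       granule / 1024, what);
}

/*
 * Map [mem_start, mem_end) mostly with blocks, but give the KFENCE pool
 * [pool_start, pool_end) its own page-granular mappings -- the same idea as
 * marking the pool NOMAP during the main loop and mapping it separately.
 */
static void map_mem_model(uint64_t mem_start, uint64_t mem_end,
			  uint64_t pool_start, uint64_t pool_end)
{
	map_range(mem_start, pool_start, BLOCK_SIZE, "linear map");
	map_range(pool_start, pool_end, SMALL_PAGE, "kfence pool");
	map_range(pool_end, mem_end, BLOCK_SIZE, "linear map");
}

int main(void)
{
	/* hypothetical layout: 1 GiB of RAM with a 2 MiB pool in the middle */
	map_mem_model(0x40000000, 0x80000000, 0x60000000, 0x60200000);
	return 0;
}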
Signed-off-by: Jisheng Zhang <Jisheng.Zhang@synaptics.com>
Conflicts:
	mm/kfence/core.c
Signed-off-by: Liu Shixin <liushixin2@huawei.com>
Reviewed-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 arch/arm64/include/asm/kfence.h |  8 +++++++-
 arch/arm64/kernel/setup.c       |  3 +++
 arch/arm64/mm/mmu.c             | 33 +++++++++++++++++++++++--------
 include/linux/kfence.h          |  2 ++
 mm/kfence/core.c                | 16 ++++++++++++++--
 5 files changed, 51 insertions(+), 11 deletions(-)
diff --git a/arch/arm64/include/asm/kfence.h b/arch/arm64/include/asm/kfence.h
index d061176d57ea..64d7cbfe067b 100644
--- a/arch/arm64/include/asm/kfence.h
+++ b/arch/arm64/include/asm/kfence.h
@@ -8,9 +8,15 @@
 #ifndef __ASM_KFENCE_H
 #define __ASM_KFENCE_H

+#include <linux/kfence.h>
 #include <asm/cacheflush.h>

-static inline bool arch_kfence_init_pool(void) { return true; }
+static inline bool arch_kfence_init_pool(void)
+{
+	memset(__kfence_pool, 0, KFENCE_POOL_SIZE);
+
+	return true;
+}

 static inline bool kfence_protect_page(unsigned long addr, bool protect)
 {
diff --git a/arch/arm64/kernel/setup.c b/arch/arm64/kernel/setup.c
index f6e56847a4e9..ddca8d27fca6 100644
--- a/arch/arm64/kernel/setup.c
+++ b/arch/arm64/kernel/setup.c
@@ -18,6 +18,7 @@
 #include <linux/screen_info.h>
 #include <linux/init.h>
 #include <linux/kexec.h>
+#include <linux/kfence.h>
 #include <linux/root_dev.h>
 #include <linux/cpu.h>
 #include <linux/interrupt.h>
@@ -374,6 +375,8 @@ void __init __no_sanitize_address setup_arch(char **cmdline_p)
 	arm64_memblock_init();

+	kfence_early_alloc_pool();
+
 	efi_fake_memmap();
 	efi_find_mirror();

diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
index 7314a7a3613f..56bfd692ebd9 100644
--- a/arch/arm64/mm/mmu.c
+++ b/arch/arm64/mm/mmu.c
@@ -13,6 +13,7 @@
 #include <linux/init.h>
 #include <linux/ioport.h>
 #include <linux/kexec.h>
+#include <linux/kfence.h>
 #include <linux/libfdt.h>
 #include <linux/mman.h>
 #include <linux/nodemask.h>
@@ -477,10 +478,19 @@ static void __init map_mem(pgd_t *pgdp)
 	int flags = 0, eflags = 0;
 	u64 i;

-	if (rodata_full || debug_pagealloc_enabled() ||
-	    IS_ENABLED(CONFIG_KFENCE))
+	if (rodata_full || debug_pagealloc_enabled())
 		flags = NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS;

+#ifdef CONFIG_KFENCE
+	/*
+	 * KFENCE requires linear map to be mapped at page granularity, so
+	 * temporarily skip mapping for __kfence_pool in the following
+	 * for-loop
+	 */
+	if (__kfence_pool)
+		memblock_mark_nomap(__pa(__kfence_pool), KFENCE_POOL_SIZE);
+#endif
+
 	/*
 	 * Take care not to create a writable alias for the
 	 * read-only text and rodata sections of the kernel image.
@@ -553,6 +563,18 @@ static void __init map_mem(pgd_t *pgdp)
 					    resource_size(&crashk_res));
 	}
 #endif
+#ifdef CONFIG_KFENCE
+	/*
+	 * Map the __kfence_pool at page granularity now.
+	 */
+	if (__kfence_pool) {
+		__map_memblock(pgdp, __pa(__kfence_pool),
+			       __pa(__kfence_pool + KFENCE_POOL_SIZE),
+			       pgprot_tagged(PAGE_KERNEL),
+			       NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS);
+		memblock_clear_nomap(__pa(__kfence_pool), KFENCE_POOL_SIZE);
+	}
+#endif
 }

 void mark_rodata_ro(void)
@@ -1483,12 +1505,7 @@ int arch_add_memory(int nid, u64 start, u64 size,
 	}

-	/*
-	 * KFENCE requires linear map to be mapped at page granularity, so that
-	 * it is possible to protect/unprotect single pages in the KFENCE pool.
-	 */
-	if (rodata_full || debug_pagealloc_enabled() ||
-	    IS_ENABLED(CONFIG_KFENCE))
+	if (rodata_full || debug_pagealloc_enabled())
 		flags = NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS;

 	__create_pgd_mapping(swapper_pg_dir, start, __phys_to_virt(start),
diff --git a/include/linux/kfence.h b/include/linux/kfence.h
index 4b5e3679a72c..00dee7459cd9 100644
--- a/include/linux/kfence.h
+++ b/include/linux/kfence.h
@@ -56,6 +56,7 @@ static __always_inline bool is_kfence_address(const void *addr)
 	return unlikely((unsigned long)((char *)addr - __kfence_pool) < KFENCE_POOL_SIZE && __kfence_pool);
 }

+void __init kfence_early_alloc_pool(void);
 /**
  * kfence_alloc_pool() - allocate the KFENCE pool via memblock
  */
@@ -205,6 +206,7 @@ bool __must_check kfence_handle_page_fault(unsigned long addr, bool is_write, st
 #else /* CONFIG_KFENCE */

 static inline bool is_kfence_address(const void *addr) { return false; }
+static inline void kfence_early_alloc_pool(void) { }
 static inline void kfence_alloc_pool(void) { }
 static inline void kfence_init(void) { }
 static inline void kfence_shutdown_cache(struct kmem_cache *s) { }
diff --git a/mm/kfence/core.c b/mm/kfence/core.c
index a19154a8d196..370721509958 100644
--- a/mm/kfence/core.c
+++ b/mm/kfence/core.c
@@ -752,14 +752,26 @@ static void toggle_allocation_gate(struct work_struct *work)
 static DECLARE_DELAYED_WORK(kfence_timer, toggle_allocation_gate);

 /* === Public interface ===================================================== */
+void __init kfence_early_alloc_pool(void)
+{
+	if (!kfence_sample_interval)
+		return;
+
+	__kfence_pool = memblock_alloc_raw(KFENCE_POOL_SIZE, PAGE_SIZE);
+
+	if (!__kfence_pool) {
+		kfence_sample_interval = 0;
+		pr_err("failed to early allocate pool, disable KFENCE\n");
+	}
+}

 void __init kfence_alloc_pool(void)
 {
 	if (!kfence_sample_interval)
 		return;

-	__kfence_pool = memblock_alloc(KFENCE_POOL_SIZE, PAGE_SIZE);
-
+	if (!__kfence_pool)
+		__kfence_pool = memblock_alloc(KFENCE_POOL_SIZE, PAGE_SIZE);
 	if (!__kfence_pool)
 		pr_err("failed to allocate pool\n");
 }
From: Liu Shixin <liushixin2@huawei.com>

hulk inclusion
category: bugfix
bugzilla: 186414, https://gitee.com/openeuler/kernel/issues/I53YXV
CVE: NA
--------------------------------
Commit 1919867e8bad moved the allocation of kfence_pool forward into setup_arch(). Since parameters registered with module_param_cb are parsed after setup_arch(), setting sample_interval and num_objects on the kernel command line has no effect. Add an early_param handler so the command line is parsed before kfence_pool is allocated.
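As a usage reminder, booting with e.g. kfence.sample_interval=100 is what this handler makes effective again. The snippet below is only a user-space model of the parse step (strtoul standing in for kstrtoul): malformed values are ignored and the default kept.

#include <stdio.h>
#include <stdlib.h>
#include <errno.h>

static unsigned long kfence_sample_interval = 100;	/* default, in ms */

/* Model of the early_param("kfence.sample_interval", ...) handler. */
static int parse_sample_interval(const char *str)
{
	char *end;
	unsigned long num;

	errno = 0;
	num = strtoul(str, &end, 0);
	if (errno || end == str || *end != '\0')
		return 0;		/* ignore malformed values, keep default */
	kfence_sample_interval = num;
	return 0;
}

int main(void)
{
	parse_sample_interval("500");
	printf("sample interval: %lu ms\n", kfence_sample_interval);
	return 0;
}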
Fixes: 1919867e8bad ("arm64: remove page granularity limitation from KFENCE")
Conflicts:
	mm/kfence/core.c
Signed-off-by: Liu Shixin <liushixin2@huawei.com>
Reviewed-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 mm/kfence/core.c | 13 +++++++++++++
 1 file changed, 13 insertions(+)
diff --git a/mm/kfence/core.c b/mm/kfence/core.c
index 370721509958..144e8da3101d 100644
--- a/mm/kfence/core.c
+++ b/mm/kfence/core.c
@@ -85,6 +85,19 @@ static const struct kernel_param_ops sample_interval_param_ops = {
 };
 module_param_cb(sample_interval, &sample_interval_param_ops, &kfence_sample_interval, 0600);

+#ifdef CONFIG_ARM64
+static int __init parse_sample_interval(char *str)
+{
+	unsigned long num;
+
+	if (kstrtoul(str, 0, &num) < 0)
+		return 0;
+	kfence_sample_interval = num;
+	return 0;
+}
+early_param("kfence.sample_interval", parse_sample_interval);
+#endif
+
 /* Pool usage% threshold when currently covered allocations are skipped. */
 static unsigned long kfence_skip_covered_thresh __read_mostly = 75;
 module_param_named(skip_covered_thresh, kfence_skip_covered_thresh, ulong, 0644);
From: Zhang Qiao <zhangqiao22@huawei.com>

hulk inclusion
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I4VZJT
CVE: NA
--------------------------------
Previously, when the cpu was detected to be overloaded, offline tasks were throttled in exit_to_user_mode_loop() before returning to user mode. Some architectures (e.g. arm64) cannot support the QOS scheduler this way, because on those platforms tasks do not return to userspace via exit_to_user_mode_loop(). To solve this problem and support the qos scheduler on all architectures, when offline tasks need to be throttled, set the TIF_NOTIFY_RESUME flag on an offline task when it is picked, and throttle it in tracehook_notify_resume().
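A rough user-space model of that flow is sketched below. The names and fields are invented stand-ins, not the kernel's: the point is that the pick path only marks the task, and the throttling itself happens later in the generic resume hook that every architecture already runs on the way back to user mode.

#include <stdbool.h>
#include <stdio.h>

struct task {
	const char *name;
	int qos_level;			/* < 0 means "offline" task */
	bool is_kthread;
	bool notify_resume;		/* stands in for TIF_NOTIFY_RESUME */
};

static bool cpu_overloaded;		/* stands in for the per-cpu overload flag */

/* Called from the pick path: only mark the task, never sleep here. */
static void qos_schedule_throttle_model(struct task *t)
{
	if (t->is_kthread)
		return;
	if (cpu_overloaded && t->qos_level < 0)
		t->notify_resume = true;
}

/* Called on the (architecture-independent) return-to-user path. */
static void notify_resume_hook_model(struct task *t)
{
	if (!t->notify_resume)
		return;
	t->notify_resume = false;
	printf("%s: throttling offline task before user mode\n", t->name);
	/* the real code waits here until the overload condition clears */
}

int main(void)
{
	struct task offline = { .name = "offline-job", .qos_level = -1 };

	cpu_overloaded = true;
	qos_schedule_throttle_model(&offline);	/* pick: mark only */
	notify_resume_hook_model(&offline);	/* resume: actually throttle */
	return 0;
}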
Signed-off-by: Zhang Qiao <zhangqiao22@huawei.com>
Reviewed-by: Chen Hui <judy.chenhui@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 include/linux/tracehook.h |  4 ++++
 kernel/entry/common.c     |  7 +------
 kernel/sched/fair.c       | 34 +++++++++++++++++++++++++++-------
 3 files changed, 32 insertions(+), 13 deletions(-)
diff --git a/include/linux/tracehook.h b/include/linux/tracehook.h
index b480e1a07ed8..5913deb26219 100644
--- a/include/linux/tracehook.h
+++ b/include/linux/tracehook.h
@@ -196,6 +196,10 @@ static inline void tracehook_notify_resume(struct pt_regs *regs)
 	mem_cgroup_handle_over_high();
 	blkcg_maybe_throttle_current();
+#ifdef CONFIG_QOS_SCHED
+	sched_qos_offline_wait();
+#endif
+
 }

 #endif	/* <linux/tracehook.h> */
diff --git a/kernel/entry/common.c b/kernel/entry/common.c
index 18a29ca01bfe..cea3957ebdbc 100644
--- a/kernel/entry/common.c
+++ b/kernel/entry/common.c
@@ -160,10 +160,6 @@ static unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
 		if (ti_work & _TIF_SIGPENDING)
 			arch_do_signal(regs);

-#ifdef CONFIG_QOS_SCHED
-		sched_qos_offline_wait();
-#endif
-
 		if (ti_work & _TIF_NOTIFY_RESUME) {
 			tracehook_notify_resume(regs);
 			rseq_handle_notify_resume(NULL, regs);
@@ -198,8 +194,7 @@ static void exit_to_user_mode_prepare(struct pt_regs *regs)
 	/* Flush pending rcuog wakeup before the last need_resched() check */
 	rcu_nocb_flush_deferred_wakeup();

-	if (unlikely((ti_work & EXIT_TO_USER_MODE_WORK) ||
-	    sched_qos_cpu_overload()))
+	if (unlikely(ti_work & EXIT_TO_USER_MODE_WORK))
 		ti_work = exit_to_user_mode_loop(regs, ti_work);

 	arch_exit_to_user_mode_prepare(regs, ti_work);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index f577d581166b..e85ae008e9da 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -26,6 +26,7 @@
 #endif
 #ifdef CONFIG_QOS_SCHED
 #include <linux/delay.h>
+#include <linux/tracehook.h>
 #endif

 /*
@@ -7178,6 +7179,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
 }

 #ifdef CONFIG_QOS_SCHED
+
 static void start_qos_hrtimer(int cpu);
 static void throttle_qos_cfs_rq(struct cfs_rq *cfs_rq)
 {
@@ -7342,15 +7344,11 @@ void sched_qos_offline_wait(void)
 		rcu_read_lock();
 		qos_level = task_group(current)->qos_level;
 		rcu_read_unlock();
-		if (qos_level != -1 || signal_pending(current))
+		if (qos_level != -1 || fatal_signal_pending(current))
 			break;
-		msleep_interruptible(sysctl_offline_wait_interval);
-	}
-}

-int sched_qos_cpu_overload(void)
-{
-	return __this_cpu_read(qos_cpu_overload);
+		schedule_timeout_killable(msecs_to_jiffies(sysctl_offline_wait_interval));
+	}
 }

 static enum hrtimer_restart qos_overload_timer_handler(struct hrtimer *timer)
@@ -7383,6 +7381,23 @@ void init_qos_hrtimer(int cpu)
 	hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
 	hrtimer->function = qos_overload_timer_handler;
 }
+
+/*
+ * To avoid Priority inversion issues, when this cpu is qos_cpu_overload,
+ * we should schedule offline tasks to run so that they can leave kernel
+ * critical sections, and throttle them before returning to user mode.
+ */
+static void qos_schedule_throttle(struct task_struct *p)
+{
+	if (unlikely(current->flags & PF_KTHREAD))
+		return;
+
+	if (unlikely(this_cpu_read(qos_cpu_overload))) {
+		if (task_group(p)->qos_level < 0)
+			set_notify_resume(p);
+	}
+}
+
 #endif

 #ifdef CONFIG_QOS_SCHED_SMT_EXPELLER
@@ -7690,9 +7705,14 @@ done: __maybe_unused;
 	update_misfit_status(p, rq);

+#ifdef CONFIG_QOS_SCHED
+	qos_schedule_throttle(p);
+#endif
+
 #ifdef CONFIG_QOS_SCHED_SMT_EXPELLER
 	qos_smt_expel(this_cpu, p);
 #endif
+
 	return p;
idle:
From: Zhang Qiao <zhangqiao22@huawei.com>

hulk inclusion
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I4VZJT
CVE: NA
--------------------------------
1. The qos throttle path reused tg_{throttle,unthrottle}_{up,down}, which also write some cfs-bandwidth fields and may therefore corrupt that state. Add dedicated qos_tg_{throttle,unthrottle}_{up,down} callbacks for the qos throttle path (see the sketch after this list).

2. Callers of walk_tg_tree_from() must hold the RCU read lock; the qos unthrottle path currently does not, so take it there now.
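For readers unfamiliar with walk_tg_tree_from(), the stand-alone sketch below models the shape of the fix (toy tree and counters, not kernel code): a pre-order "down" callback and a post-order "up" callback visit every group in the subtree, and the dedicated qos callbacks touch only a throttle counter instead of the cfs-bandwidth bookkeeping.

#include <stdio.h>

struct task_group {
	const char *name;
	int throttle_count;
	struct task_group *children[4];
	int nr_children;
};

typedef int (*tg_visitor)(struct task_group *tg, void *data);

static int tg_nop(struct task_group *tg, void *data)
{
	(void)tg; (void)data;
	return 0;
}

/* Pre-order "down" and post-order "up" walk, like walk_tg_tree_from(). */
static void walk_tg_tree_model(struct task_group *from,
			       tg_visitor down, tg_visitor up, void *data)
{
	down(from, data);
	for (int i = 0; i < from->nr_children; i++)
		walk_tg_tree_model(from->children[i], down, up, data);
	up(from, data);
}

/* The qos-specific callbacks only touch the per-group throttle counter. */
static int qos_throttle_down_model(struct task_group *tg, void *data)
{
	(void)data;
	tg->throttle_count++;
	return 0;
}

static int qos_unthrottle_up_model(struct task_group *tg, void *data)
{
	(void)data;
	tg->throttle_count--;
	return 0;
}

int main(void)
{
	struct task_group leaf = { .name = "offline/child" };
	struct task_group root = { .name = "offline", .children = { &leaf },
				   .nr_children = 1 };

	/* throttle: mark the whole subtree on the way down */
	walk_tg_tree_model(&root, qos_throttle_down_model, tg_nop, NULL);
	/* unthrottle: clear it on the way back up (under rcu_read_lock() in the kernel) */
	walk_tg_tree_model(&root, tg_nop, qos_unthrottle_up_model, NULL);

	printf("%s throttle_count=%d\n", leaf.name, leaf.throttle_count);
	return 0;
}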
Signed-off-by: Zhang Qiao <zhangqiao22@huawei.com>
Reviewed-by: Chen Hui <judy.chenhui@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 kernel/sched/fair.c | 39 +++++++++++++++++++++++++++++----------
 1 file changed, 29 insertions(+), 10 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index e85ae008e9da..c51ed509b45d 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5460,6 +5460,10 @@ static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
 	lockdep_assert_held(&rq->lock);

+#ifdef CONFIG_QOS_SCHED
+	unthrottle_qos_cfs_rqs(cpu_of(rq));
+#endif
+
 	rcu_read_lock();
 	list_for_each_entry_rcu(tg, &task_groups, list) {
 		struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
@@ -5482,10 +5486,6 @@ static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
 			unthrottle_cfs_rq(cfs_rq);
 	}
 	rcu_read_unlock();
-
-#ifdef CONFIG_QOS_SCHED
-	unthrottle_qos_cfs_rqs(cpu_of(rq));
-#endif
 }

 #else /* CONFIG_CFS_BANDWIDTH */
@@ -7181,6 +7181,27 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
 #ifdef CONFIG_QOS_SCHED

 static void start_qos_hrtimer(int cpu);
+
+static int qos_tg_unthrottle_up(struct task_group *tg, void *data)
+{
+	struct rq *rq = data;
+	struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
+
+	cfs_rq->throttle_count--;
+
+	return 0;
+}
+
+static int qos_tg_throttle_down(struct task_group *tg, void *data)
+{
+	struct rq *rq = data;
+	struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
+
+	cfs_rq->throttle_count++;
+
+	return 0;
+}
+
 static void throttle_qos_cfs_rq(struct cfs_rq *cfs_rq)
 {
 	struct rq *rq = rq_of(cfs_rq);
@@ -7192,7 +7213,7 @@ static void throttle_qos_cfs_rq(struct cfs_rq *cfs_rq)
 	/* freeze hierarchy runnable averages while throttled */
 	rcu_read_lock();
-	walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq);
+	walk_tg_tree_from(cfs_rq->tg, qos_tg_throttle_down, tg_nop, (void *)rq);
 	rcu_read_unlock();

 	task_delta = cfs_rq->h_nr_running;
@@ -7223,7 +7244,6 @@ static void throttle_qos_cfs_rq(struct cfs_rq *cfs_rq)
 		start_qos_hrtimer(cpu_of(rq));

 	cfs_rq->throttled = 1;
-	cfs_rq->throttled_clock = rq_clock(rq);

 	list_add(&cfs_rq->qos_throttled_list,
 		 &per_cpu(qos_throttled_cfs_rq, cpu_of(rq)));
@@ -7232,7 +7252,6 @@ static void throttle_qos_cfs_rq(struct cfs_rq *cfs_rq)
 static void unthrottle_qos_cfs_rq(struct cfs_rq *cfs_rq)
 {
 	struct rq *rq = rq_of(cfs_rq);
-	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
 	struct sched_entity *se;
 	int enqueue = 1;
 	unsigned int prev_nr = cfs_rq->h_nr_running;
@@ -7243,12 +7262,12 @@ static void unthrottle_qos_cfs_rq(struct cfs_rq *cfs_rq)
 	cfs_rq->throttled = 0;

 	update_rq_clock(rq);
-
-	cfs_b->throttled_time += rq_clock(rq) - cfs_rq->throttled_clock;
 	list_del_init(&cfs_rq->qos_throttled_list);

 	/* update hierarchical throttle state */
-	walk_tg_tree_from(cfs_rq->tg, tg_nop, tg_unthrottle_up, (void *)rq);
+	rcu_read_lock();
+	walk_tg_tree_from(cfs_rq->tg, tg_nop, qos_tg_unthrottle_up, (void *)rq);
+	rcu_read_unlock();

 	if (!cfs_rq->load.weight)
 		return;