From: xulei <stone.xulei@huawei.com>

virt inclusion
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I4NP0K
CVE: NA

-------------------
Fix a lost-SMI problem: if userspace injects an SMI while the vCPU is still in System Management Mode, force the vCPU out of SMM first (clear HF_SMM_MASK and the stale smi_pending) so that the new KVM_REQ_SMI is not silently dropped.
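For context, the SMM check used by the fix is trivial; a minimal sketch of the helper as defined in arch/x86/kvm (paraphrased here, not part of this diff):

	static inline bool is_smm(struct kvm_vcpu *vcpu)
	{
		return vcpu->arch.hflags & HF_SMM_MASK;
	}

Clearing HF_SMM_MASK therefore takes the vCPU out of emulated SMM, and zeroing smi_pending drops the stale request before KVM_REQ_SMI queues the new one.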
Signed-off-by: xulei <stone.xulei@huawei.com>
Signed-off-by: Jingyi Wang <wangjingyi11@huawei.com>
Reviewed-by: Zenghui Yu <yuzenghui@huawei.com>
Reviewed-by: Wei Li <liwei391@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 arch/x86/kvm/x86.c | 5 +++++
 1 file changed, 5 insertions(+)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 12db47c8bd3f..e33414f36dba 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -4227,6 +4227,11 @@ static int kvm_vcpu_ioctl_nmi(struct kvm_vcpu *vcpu)
 
 static int kvm_vcpu_ioctl_smi(struct kvm_vcpu *vcpu)
 {
+	if (is_smm(vcpu)) {
+		vcpu->arch.hflags &= ~HF_SMM_MASK;
+		vcpu->arch.smi_pending = 0;
+	}
+
 	kvm_make_request(KVM_REQ_SMI, vcpu);
 
 	return 0;
From: Wanpeng Li <wanpengli@tencent.com>

mainline inclusion
from mainline-v5.12-rc4
commit c2162e13d6e2f43e5001a356196871642de070ba
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I4NP0K
CVE: NA

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...

-------------------
In order to deal with noncoherent DMA, we should execute wbinvd on all dirty pCPUs when a guest wbinvd exits, to maintain data consistency. smp_call_function_many() does not execute the provided function on the local core, so replace it with on_each_cpu_mask().
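A minimal sketch of the semantic difference (simplified; the real caller passes vcpu->arch.wbinvd_dirty_mask as the mask):

	int cpu = get_cpu();

	cpumask_set_cpu(cpu, mask);	/* the local pCPU is dirty too */
	/* before: IPIs every other CPU in mask, never runs wbinvd_ipi() locally */
	smp_call_function_many(mask, wbinvd_ipi, NULL, 1);
	/* after: also runs wbinvd_ipi() on the calling CPU when it is in mask */
	on_each_cpu_mask(mask, wbinvd_ipi, NULL, 1);
	put_cpu();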
Reported-by: Nadav Amit <namit@vmware.com>
Cc: Nadav Amit <namit@vmware.com>
Signed-off-by: Wanpeng Li <wanpengli@tencent.com>
Message-Id: <1615517151-7465-1-git-send-email-wanpengli@tencent.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Jingyi Wang <wangjingyi11@huawei.com>
Reviewed-by: Zenghui Yu <yuzenghui@huawei.com>
Reviewed-by: Wei Li <liwei391@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 arch/x86/kvm/x86.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index e33414f36dba..d5dc43b8cae1 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -6655,7 +6655,7 @@ static int kvm_emulate_wbinvd_noskip(struct kvm_vcpu *vcpu)
 		int cpu = get_cpu();
 
 		cpumask_set_cpu(cpu, vcpu->arch.wbinvd_dirty_mask);
-		smp_call_function_many(vcpu->arch.wbinvd_dirty_mask,
+		on_each_cpu_mask(vcpu->arch.wbinvd_dirty_mask,
 				wbinvd_ipi, NULL, 1);
 		put_cpu();
 		cpumask_clear(vcpu->arch.wbinvd_dirty_mask);
From: Wanpeng Li <wanpengli@tencent.com>

mainline inclusion
from mainline-v5.13-rc6
commit e898da784aed0ea65f7672d941c01dc9b79e6299
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I4NP0K
CVE: NA

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...

-------------------
According to the SDM 10.5.4.1:
A write of 0 to the initial-count register effectively stops the local APIC timer, in both one-shot and periodic mode.
However, a LAPIC timer in one-shot/periodic mode that is emulated by the vmx-preemption timer is not stopped by a write of 0 to TMICT, because vmx->hv_deadline_tsc is still programmed, and the guest will later receive a spurious timer interrupt. Fix this by also cancelling the vmx-preemption timer when 0 is written to the initial-count register.
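From the guest's point of view, the sequence being fixed is simply (illustrative guest code):

	apic_write(APIC_TMICT, 100000);	/* arm the one-shot/periodic timer */
	apic_write(APIC_TMICT, 0);	/* per SDM 10.5.4.1, the timer must stop here */
	/*
	 * Before this fix, a LAPIC timer backed by the vmx-preemption timer
	 * could still fire after the second write, because
	 * vmx->hv_deadline_tsc remained programmed.
	 */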
Reviewed-by: Sean Christopherson <seanjc@google.com>
Signed-off-by: Wanpeng Li <wanpengli@tencent.com>
Message-Id: <1623050385-100988-1-git-send-email-wanpengli@tencent.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Jingyi Wang <wangjingyi11@huawei.com>
Reviewed-by: Zenghui Yu <yuzenghui@huawei.com>
Reviewed-by: Wei Li <liwei391@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 arch/x86/kvm/lapic.c | 17 +++++++++++------
 1 file changed, 11 insertions(+), 6 deletions(-)
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 677d21082454..94e75c941b47 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -1492,6 +1492,15 @@ static void limit_periodic_timer_frequency(struct kvm_lapic *apic)
 
 static void cancel_hv_timer(struct kvm_lapic *apic);
 
+static void cancel_apic_timer(struct kvm_lapic *apic)
+{
+	hrtimer_cancel(&apic->lapic_timer.timer);
+	preempt_disable();
+	if (apic->lapic_timer.hv_timer_in_use)
+		cancel_hv_timer(apic);
+	preempt_enable();
+}
+
 static void apic_update_lvtt(struct kvm_lapic *apic)
 {
 	u32 timer_mode = kvm_lapic_get_reg(apic, APIC_LVTT) &
@@ -1500,11 +1509,7 @@ static void apic_update_lvtt(struct kvm_lapic *apic)
 	if (apic->lapic_timer.timer_mode != timer_mode) {
 		if (apic_lvtt_tscdeadline(apic) !=
 				(timer_mode == APIC_LVT_TIMER_TSCDEADLINE)) {
-			hrtimer_cancel(&apic->lapic_timer.timer);
-			preempt_disable();
-			if (apic->lapic_timer.hv_timer_in_use)
-				cancel_hv_timer(apic);
-			preempt_enable();
+			cancel_apic_timer(apic);
 			kvm_lapic_set_reg(apic, APIC_TMICT, 0);
 			apic->lapic_timer.period = 0;
 			apic->lapic_timer.tscdeadline = 0;
@@ -2082,7 +2087,7 @@ int kvm_lapic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val)
 		if (apic_lvtt_tscdeadline(apic))
 			break;
 
-		hrtimer_cancel(&apic->lapic_timer.timer);
+		cancel_apic_timer(apic);
 		kvm_lapic_set_reg(apic, APIC_TMICT, val);
 		start_apic_timer(apic);
 		break;
From: Sean Christopherson <seanjc@google.com>

mainline inclusion
from mainline-v5.14-rc1
commit 0aa1837533e5f4be8cc21bbc06314c23ba2c5447
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I4NP0K
CVE: NA

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...

-------------------
Reset the MMU context at vCPU INIT (and RESET for good measure) if CR0.PG was set prior to INIT. Simply re-initializing the current MMU is not sufficient as the current root HPA may not be usable in the new context. E.g. if TDP is disabled and INIT arrives while the vCPU is in long mode, KVM will fail to switch to the 32-bit pae_root and bomb on the next VM-Enter due to running with a 64-bit CR3 in 32-bit mode.
This bug was papered over in both VMX and SVM, but still managed to rear its head in the MMU role on VMX. Because EFER.LMA=1 requires CR0.PG=1, kvm_calc_shadow_mmu_root_page_role() checks for EFER.LMA without first checking CR0.PG. VMX's RESET/INIT flow writes CR0 before EFER, and so an INIT with the vCPU in 64-bit mode will cause the hack-a-fix to generate the wrong MMU role.
In VMX, the INIT issue is specific to running without unrestricted guest since unrestricted guest is available if and only if EPT is enabled. Commit 8668a3c468ed ("KVM: VMX: Reset mmu context when entering real mode") resolved the issue by forcing a reset when entering emulated real mode.
In SVM, commit ebae871a509d ("kvm: svm: reset mmu on VCPU reset") forced an MMU reset on every INIT to work around the flaw in common x86. Note, at the time the bug was fixed, the SVM problem was exacerbated by a complete lack of a CR4 update.
The vendor resets will be reverted in future patches, primarily to aid bisection in case there are non-INIT flows that rely on the existing VMX logic.
Because CR0.PG is unconditionally cleared on INIT, and because CR0.WP and all CR4/EFER paging bits are ignored if CR0.PG=0, simply checking that CR0.PG was '1' prior to INIT/RESET is sufficient to detect a required MMU context reset.
Cc: stable@vger.kernel.org
Signed-off-by: Sean Christopherson <seanjc@google.com>
Message-Id: <20210622175739.3610207-4-seanjc@google.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Jingyi Wang <wangjingyi11@huawei.com>
Reviewed-by: Zenghui Yu <yuzenghui@huawei.com>
Reviewed-by: Wei Li <liwei391@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 arch/x86/kvm/x86.c | 13 +++++++++++++
 1 file changed, 13 insertions(+)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index d5dc43b8cae1..05f41234687a 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -10202,6 +10202,8 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
 
 void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
 {
+	unsigned long old_cr0 = kvm_read_cr0(vcpu);
+
 	kvm_lapic_reset(vcpu, init_event);
 
 	vcpu->arch.hflags = 0;
@@ -10270,6 +10272,17 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
 	vcpu->arch.ia32_xss = 0;
 
 	kvm_x86_ops.vcpu_reset(vcpu, init_event);
+
+	/*
+	 * Reset the MMU context if paging was enabled prior to INIT (which is
+	 * implied if CR0.PG=1 as CR0 will be '0' prior to RESET).  Unlike the
+	 * standard CR0/CR4/EFER modification paths, only CR0.PG needs to be
+	 * checked because it is unconditionally cleared on INIT and all other
+	 * paging related bits are ignored if paging is disabled, i.e. CR0.WP,
+	 * CR4, and EFER changes are all irrelevant if CR0.PG was '0'.
+	 */
+	if (old_cr0 & X86_CR0_PG)
+		kvm_mmu_reset_context(vcpu);
 }
 
 void kvm_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector)
From: Wanpeng Li <wanpengli@tencent.com>

mainline inclusion
from mainline-v5.14-rc1
commit 2735886c9ef115fc7b40d27bfe73605c38e9d56b
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I4NP0K
CVE: NA

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...

-------------------
KVM_GET_LAPIC stores the current value of TMCCT, and KVM_SET_LAPIC's memcpy stores it in vcpu->arch.apic->regs. Since TMCCT is always computed on demand and is never directly readable, KVM_SET_LAPIC can simply store zero in vcpu->arch.apic->regs after it has used the value, so the stored value is always zero.
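Storing zero is safe because reads never hit the register file; a simplified, paraphrased excerpt of the read path in lapic.c (__apic_read()):

	case APIC_TMCCT:	/* Timer Current Count */
		if (apic_lvtt_tscdeadline(apic))
			return 0;
		val = apic_get_tmcct(apic);	/* always computed on demand */
		break;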
Suggested-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Wanpeng Li <wanpengli@tencent.com>
Message-Id: <1623223000-18116-1-git-send-email-wanpengli@tencent.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Jingyi Wang <wangjingyi11@huawei.com>
Reviewed-by: Zenghui Yu <yuzenghui@huawei.com>
Reviewed-by: Wei Li <liwei391@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 arch/x86/kvm/lapic.c | 1 +
 1 file changed, 1 insertion(+)
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 94e75c941b47..70dcb723a0f9 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -2615,6 +2615,7 @@ int kvm_apic_set_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s)
 	apic_manage_nmi_watchdog(apic, kvm_lapic_get_reg(apic, APIC_LVT0));
 	update_divide_count(apic);
 	__start_apic_timer(apic, APIC_TMCCT);
+	kvm_lapic_set_reg(apic, APIC_TMCCT, 0);
 	kvm_apic_update_apicv(vcpu);
 	apic->highest_isr_cache = -1;
 	if (vcpu->arch.apicv_active) {
From: Marc Zyngier <maz@kernel.org>

mainline inclusion
from mainline-v5.11-rc1
commit 14bda7a927336055d7c0deb1483f9cdb687c2080
bugzilla: https://gitee.com/openeuler/kernel/issues/I4NP0K
CVE: NA

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...

-------------------
There are a number of places where we check for the KVM_ARM_VCPU_PMU_V3 feature. Wrap this check into a new kvm_vcpu_has_pmu(), and use it at the existing locations.
No functional change.
Reviewed-by: Alexandru Elisei <alexandru.elisei@arm.com>
Signed-off-by: Marc Zyngier <maz@kernel.org>
Signed-off-by: Jingyi Wang <wangjingyi11@huawei.com>
Reviewed-by: Zenghui Yu <yuzenghui@huawei.com>
Reviewed-by: Wei Li <liwei391@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 arch/arm64/include/asm/kvm_host.h | 3 +++
 arch/arm64/kvm/pmu-emul.c         | 8 +++-----
 2 files changed, 6 insertions(+), 5 deletions(-)
diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
index 37aba086c179..fcd8cc7d3cb0 100644
--- a/arch/arm64/include/asm/kvm_host.h
+++ b/arch/arm64/include/asm/kvm_host.h
@@ -700,6 +700,9 @@ bool kvm_arm_vcpu_is_finalized(struct kvm_vcpu *vcpu);
 #define kvm_arm_vcpu_sve_finalized(vcpu) \
 	((vcpu)->arch.flags & KVM_ARM64_VCPU_SVE_FINALIZED)
 
+#define kvm_vcpu_has_pmu(vcpu)				\
+	(test_bit(KVM_ARM_VCPU_PMU_V3, (vcpu)->arch.features))
+
 #ifdef CONFIG_ARM64_TWED
 #define use_twed() (has_twed() && twed_enable)
 extern bool twed_enable;
diff --git a/arch/arm64/kvm/pmu-emul.c b/arch/arm64/kvm/pmu-emul.c
index 4b30260e1abf..a2a20f358f13 100644
--- a/arch/arm64/kvm/pmu-emul.c
+++ b/arch/arm64/kvm/pmu-emul.c
@@ -918,8 +918,7 @@ static bool pmu_irq_is_valid(struct kvm *kvm, int irq)
 
 int kvm_arm_pmu_v3_set_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr)
 {
-	if (!kvm_arm_support_pmu_v3() ||
-	    !test_bit(KVM_ARM_VCPU_PMU_V3, vcpu->arch.features))
+	if (!kvm_arm_support_pmu_v3() || !kvm_vcpu_has_pmu(vcpu))
 		return -ENODEV;
 
 	if (vcpu->arch.pmu.created)
@@ -1020,7 +1019,7 @@ int kvm_arm_pmu_v3_get_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr)
 	if (!irqchip_in_kernel(vcpu->kvm))
 		return -EINVAL;
 
-	if (!test_bit(KVM_ARM_VCPU_PMU_V3, vcpu->arch.features))
+	if (!kvm_vcpu_has_pmu(vcpu))
 		return -ENODEV;
 
 	if (!kvm_arm_pmu_irq_initialized(vcpu))
@@ -1040,8 +1039,7 @@ int kvm_arm_pmu_v3_has_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr)
 	case KVM_ARM_VCPU_PMU_V3_IRQ:
 	case KVM_ARM_VCPU_PMU_V3_INIT:
 	case KVM_ARM_VCPU_PMU_V3_FILTER:
-		if (kvm_arm_support_pmu_v3() &&
-		    test_bit(KVM_ARM_VCPU_PMU_V3, vcpu->arch.features))
+		if (kvm_arm_support_pmu_v3() && kvm_vcpu_has_pmu(vcpu))
 			return 0;
 	}
From: Alexandru Elisei <alexandru.elisei@arm.com>

mainline inclusion
from mainline-v5.11-rc1
commit 9bbfa4b565379eeb2fb8fdbcc9979549ae0e48d9
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I4NP0K
CVE: NA

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...

-------------------
When enabling the PMU in kvm_arm_pmu_v3_enable(), KVM returns early if the PMU's 'created' flag is false and skips any other checks. Because PMU emulation is gated only on the VCPU feature being set, this makes it possible for userspace to get away with setting the VCPU feature but not doing any initialization for the PMU. Fix it by returning an error when trying to run the VCPU if the PMU hasn't been initialized correctly.
The PMU is marked as created only if the interrupt ID has been set when using an in-kernel irqchip. This means the same check in kvm_arm_pmu_v3_enable() is redundant, remove it.
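A hedged sketch of the userspace sequence this check now enforces (the UAPI names are real; vcpu setup and error handling are omitted):

	struct kvm_device_attr attr = {
		.group	= KVM_ARM_VCPU_PMU_V3_CTRL,
		.attr	= KVM_ARM_VCPU_PMU_V3_IRQ,
		.addr	= (__u64)&pmu_irq,
	};

	/* 1. KVM_ARM_VCPU_INIT with KVM_ARM_VCPU_PMU_V3 set in features[] */
	/* 2. configure the PMU interrupt (in-kernel irqchip) */
	ioctl(vcpu_fd, KVM_SET_DEVICE_ATTR, &attr);
	/* 3. finalize the PMU */
	attr.attr = KVM_ARM_VCPU_PMU_V3_INIT;
	attr.addr = 0;
	ioctl(vcpu_fd, KVM_SET_DEVICE_ATTR, &attr);

Skipping step 2 or 3 after setting the feature bit now makes KVM_RUN fail with -EINVAL instead of silently running with a half-initialized PMU.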
Signed-off-by: Alexandru Elisei <alexandru.elisei@arm.com>
Signed-off-by: Marc Zyngier <maz@kernel.org>
Link: https://lore.kernel.org/r/20201126144916.164075-1-alexandru.elisei@arm.com
Signed-off-by: Jingyi Wang <wangjingyi11@huawei.com>
Reviewed-by: Zenghui Yu <yuzenghui@huawei.com>
Reviewed-by: Wei Li <liwei391@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 arch/arm64/kvm/pmu-emul.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)
diff --git a/arch/arm64/kvm/pmu-emul.c b/arch/arm64/kvm/pmu-emul.c
index a2a20f358f13..00bf8733e718 100644
--- a/arch/arm64/kvm/pmu-emul.c
+++ b/arch/arm64/kvm/pmu-emul.c
@@ -830,9 +830,12 @@ bool kvm_arm_support_pmu_v3(void)
 
 int kvm_arm_pmu_v3_enable(struct kvm_vcpu *vcpu)
 {
-	if (!vcpu->arch.pmu.created)
+	if (!kvm_vcpu_has_pmu(vcpu))
 		return 0;
 
+	if (!vcpu->arch.pmu.created)
+		return -EINVAL;
+
 	/*
 	 * A valid interrupt configuration for the PMU is either to have a
 	 * properly configured interrupt number and using an in-kernel
@@ -840,9 +843,6 @@ int kvm_arm_pmu_v3_enable(struct kvm_vcpu *vcpu)
 	 */
 	if (irqchip_in_kernel(vcpu->kvm)) {
 		int irq = vcpu->arch.pmu.irq_num;
-		if (!kvm_arm_pmu_irq_initialized(vcpu))
-			return -EINVAL;
-
 		/*
 		 * If we are using an in-kernel vgic, at this point we know
 		 * the vgic will be initialized, so we can check the PMU irq
From: Marc Zyngier <maz@kernel.org>

mainline inclusion
from mainline-v5.14-rc1
commit d0c94c49792cf780cbfefe29f81bb8c3b73bc76b
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I4NP0K
CVE: NA

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...

-------------------
Restoring a guest with an active virtual PMU results in no perf counters being instantiated on the host side. Not quite what you'd expect from a restore.
In order to fix this, force a writeback of PMCR_EL0 on the first run of a vcpu (using a new request so that it happens once the vcpu has been loaded). This will in turn create all the host-side counters that were missing.
Reported-by: Jinank Jain <jinankj@amazon.de>
Tested-by: Jinank Jain <jinankj@amazon.de>
Signed-off-by: Marc Zyngier <maz@kernel.org>
Link: https://lore.kernel.org/r/87wnrbylxv.wl-maz@kernel.org
Link: https://lore.kernel.org/r/b53dfcf9bbc4db7f96154b1cd5188d72b9766358.camel@ama...
Signed-off-by: Jingyi Wang <wangjingyi11@huawei.com>
Reviewed-by: Zenghui Yu <yuzenghui@huawei.com>
Reviewed-by: Wei Li <liwei391@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 arch/arm64/include/asm/kvm_host.h | 1 +
 arch/arm64/kvm/arm.c              | 4 ++++
 arch/arm64/kvm/pmu-emul.c         | 3 +++
 3 files changed, 8 insertions(+)
diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
index fcd8cc7d3cb0..93cdc5a2fc97 100644
--- a/arch/arm64/include/asm/kvm_host.h
+++ b/arch/arm64/include/asm/kvm_host.h
@@ -47,6 +47,7 @@
 #define KVM_REQ_VCPU_RESET	KVM_ARCH_REQ(2)
 #define KVM_REQ_RECORD_STEAL	KVM_ARCH_REQ(3)
 #define KVM_REQ_RELOAD_GICv4	KVM_ARCH_REQ(4)
+#define KVM_REQ_RELOAD_PMU	KVM_ARCH_REQ(5)
 
 #define KVM_DIRTY_LOG_MANUAL_CAPS   (KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE | \
 				     KVM_DIRTY_LOG_INITIALLY_SET)
diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
index 7f64131641a4..3ae13ef0c980 100644
--- a/arch/arm64/kvm/arm.c
+++ b/arch/arm64/kvm/arm.c
@@ -713,6 +713,10 @@ static void check_vcpu_requests(struct kvm_vcpu *vcpu)
 			vgic_v4_load(vcpu);
 			preempt_enable();
 		}
+
+		if (kvm_check_request(KVM_REQ_RELOAD_PMU, vcpu))
+			kvm_pmu_handle_pmcr(vcpu,
+					    __vcpu_sys_reg(vcpu, PMCR_EL0));
 	}
 }
 
diff --git a/arch/arm64/kvm/pmu-emul.c b/arch/arm64/kvm/pmu-emul.c
index 00bf8733e718..597d0ed84086 100644
--- a/arch/arm64/kvm/pmu-emul.c
+++ b/arch/arm64/kvm/pmu-emul.c
@@ -858,6 +858,9 @@ int kvm_arm_pmu_v3_enable(struct kvm_vcpu *vcpu)
 	kvm_pmu_vcpu_reset(vcpu);
 	vcpu->arch.pmu.ready = true;
 
+	/* One-off reload of the PMU on first run */
+	kvm_make_request(KVM_REQ_RELOAD_PMU, vcpu);
+
 	return 0;
 }
From: Xingang Wang <wangxingang5@huawei.com>

stable inclusion
category: feature
from stable-5.13-rc1
commit 18d731242d5c67c0783126c42d3f85870cec2df5
bugzilla: https://gitee.com/openeuler/kernel/issues/I4NR4D

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=...

-------------------------------------------------
dev_set_name() can fail, and seems to be a popular target for syzkaller error injection. Check the error return and unwind with put_device().
Fixes: 7b96953bc640 ("vfio: Mediated device Core driver")
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Kevin Tian <kevin.tian@intel.com>
Reviewed-by: Max Gurtovoy <mgurtovoy@nvidia.com>
Reviewed-by: Cornelia Huck <cohuck@redhat.com>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
Message-Id: <9-v2-d36939638fc6+d54-vfio2_jgg@nvidia.com>
Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
Signed-off-by: Xingang Wang <wangxingang5@huawei.com>
Reviewed-by: Xu Xiaoyang <xuxiaoyang2@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 drivers/vfio/mdev/mdev_core.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)
diff --git a/drivers/vfio/mdev/mdev_core.c b/drivers/vfio/mdev/mdev_core.c
index b558d4cfd082..5199e6c0467e 100644
--- a/drivers/vfio/mdev/mdev_core.c
+++ b/drivers/vfio/mdev/mdev_core.c
@@ -316,7 +316,10 @@ int mdev_device_create(struct kobject *kobj,
 	mdev->dev.parent  = dev;
 	mdev->dev.bus     = &mdev_bus_type;
 	mdev->dev.release = mdev_device_release;
-	dev_set_name(&mdev->dev, "%pUl", uuid);
+	ret = dev_set_name(&mdev->dev, "%pUl", uuid);
+	if (ret)
+		goto ops_create_fail;
+
 	mdev->dev.groups  = parent->ops->mdev_attr_groups;
 	mdev->type_kobj = kobj;
From: Wang ShaoBo <bobo.shaobowang@huawei.com>

hulk inclusion
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I3YAI3
CVE: NA

-------------------------------------------------
The following error occurred occasionally on a machine that supports MPAM:
[ 13.321386][ T658] Unable to handle kernel paging request at virtual address ffff80001115816c
[ 13.326013][ T684] hid-generic 0003:12D1:0003.0002: input,hidraw1: USB HID v1.10 Mouse [Keyboard/Mouse KVM 1.1.0] on usb-0000:7a:01.0-1.1/input1
[ 13.340558][ T658] Mem abort info:
[ 13.340563][ T658]   ESR = 0x86000007
[ 13.352567][ T5] hub 6-1:1.0: USB hub found
[ 13.364750][ T658]   EC = 0x21: IABT (current EL), IL = 32 bits
[ 13.369891][ T5] hub 6-1:1.0: 4 ports detected
[ 13.373871][ T658]   SET = 0, FnV = 0
[ 13.396107][ T658]   EA = 0, S1PTW = 0
[ 13.400599][ T658] swapper pgtable: 64k pages, 48-bit VAs, pgdp=0000000029540000
[ 13.408726][ T658] [ffff80001115816c] pgd=0000205fffff0003, p4d=0000205fffff0003, pud=0000205fffff0003, pmd=0000205ffffe0003, pte=0000000000000000
[ 13.423346][ T658] Internal error: Oops: 86000007 [#1] SMP
[ 13.429720][ T658] Modules linked in:
[ 13.434243][ T658] CPU: 72 PID: 658 Comm: kworker/72:1 Not tainted 5.10.0-4.17.0.28.oe1.aarch64 #1
[ 13.443966][ T658] Hardware name: Huawei TaiShan 200 (Model 2280)/BC82AMDDA, BIOS 1.70 01/07/2021
[ 13.453683][ T658] Workqueue: events mpam_enable
[ 13.459206][ T658] pstate: 20c00009 (nzCv daif +PAN +UAO -TCO BTYPE=--)
[ 13.466625][ T658] pc : mpam_enable+0x194/0x1d8
[ 13.472019][ T658] lr : mpam_enable+0x194/0x1d8
[ 13.477301][ T658] sp : ffff80004664fd70
[ 13.481937][ T658] x29: ffff80004664fd70 x28: 0000000000000000
[ 13.488578][ T658] x27: ffff00400484a648 x26: ffff800011b71080
[ 13.495306][ T658] x25: 0000000000000000 x24: ffff800011b6cda0
[ 13.502001][ T658] x23: ffff800011646f18 x22: ffff800011b6cd80
[ 13.508684][ T658] x21: ffff800011b6c000 x20: ffff800011646f08
[ 13.515425][ T658] x19: ffff800011646f70 x18: 0000000000000020
[ 13.522075][ T658] x17: 000000001790b332 x16: 0000000000000001
[ 13.528785][ T658] x15: ffffffffffffffff x14: ff00000000000000
[ 13.535464][ T658] x13: ffffffffffffffff x12: 0000000000000006
[ 13.542045][ T658] x11: 00000091cea718e2 x10: 0000000000000b90
[ 13.548735][ T658] x9 : ffff80001009ebac x8 : ffff2040061aabf0
[ 13.555383][ T658] x7 : ffffa05f8dca0000 x6 : 000000000000000f
[ 13.561924][ T658] x5 : 0000000000000000 x4 : ffff2040061aa000
[ 13.568613][ T658] x3 : ffff80001164dfa0 x2 : 00000000ffffffff
[ 13.575267][ T658] x1 : ffffa05f8dca0000 x0 : 00000000000000c1
[ 13.581813][ T658] Call trace:
[ 13.585600][ T658]  mpam_enable+0x194/0x1d8
[ 13.590450][ T658]  process_one_work+0x1cc/0x390
[ 13.595654][ T658]  worker_thread+0x70/0x2f0
[ 13.600499][ T658]  kthread+0x118/0x120
[ 13.604935][ T658]  ret_from_fork+0x10/0x18
[ 13.609717][ T658] Code: bad PC value
[ 13.613944][ T658] ---[ end trace f1e305d2c339f67f ]---
[ 13.753818][ T658] Kernel panic - not syncing: Oops: Fatal exception
[ 13.760885][ T658] SMP: stopping secondary CPUs
[ 13.765933][ T658] Kernel Offset: disabled
[ 13.770516][ T658] CPU features: 0x8040002,22208a38
[ 13.775862][ T658] Memory Limit: none
[ 13.913929][ T658] ---[ end Kernel panic - not syncing:
The MPAM device initialization process is as follows:

mpam_discovery_start()
    ...                              // discover devices
mpam_discovery_complete()            // hang up the mpam_online/offline_cpu callbacks
 -=> mpam_cpu_online()               // probe all devices
  -=> mpam_enable()                  // prepare for resctrl
(1)-=> cpuhp_remove_state()          // clean resctrl internal structure
(2)-=> cpuhp_setup_state()           // rehang mpam_online/offline_cpu callbacks
    -=> mpam_cpu_online()            // it does not call mpam_enable again
     -=> mpam_resctrl_cpu_online()   // pull up resctrl
The re-registration of the mpam_cpu_online/offline callbacks must not be disturbed by IRQs, to ensure that the CPU context is reliable before mpam_cpu_online() is re-entered; that re-entry always happens between (1) and (2).
Fixes: 2ab89c893faf ("arm64/mpam: resctrl: Re-synchronise resctrl's view of online CPUs")
Signed-off-by: Wang ShaoBo <bobo.shaobowang@huawei.com>
Reviewed-by: Xiongfeng Wang <wangxiongfeng2@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 arch/arm64/kernel/mpam/mpam_device.c | 2 ++
 1 file changed, 2 insertions(+)
diff --git a/arch/arm64/kernel/mpam/mpam_device.c b/arch/arm64/kernel/mpam/mpam_device.c
index 86aaf52146bc..890db6a0ccaf 100644
--- a/arch/arm64/kernel/mpam/mpam_device.c
+++ b/arch/arm64/kernel/mpam/mpam_device.c
@@ -593,9 +593,11 @@ static void __init mpam_enable(struct work_struct *work)
 		pr_err("Failed to setup/init resctrl\n");
 	mutex_unlock(&mpam_devices_lock);
 
+	local_irq_disable();
 	mpam_cpuhp_state = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN,
 					     "mpam:online", mpam_cpu_online,
 					     mpam_cpu_offline);
+	local_irq_enable();
 	if (mpam_cpuhp_state <= 0)
 		pr_err("Failed to re-register 'dyn' cpuhp callbacks");
 	mutex_unlock(&mpam_cpuhp_lock);
From: Wang ShaoBo <bobo.shaobowang@huawei.com>

hulk inclusion
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I4LMMF
CVE: NA

-------------------------------------------------
Add diagnostic tips (via rdt_last_cmd_puts()) for the cases where rmid modification fails, and set the missing -EINVAL return code on the unsupported-operation path.
Fixes: a85aba6a1d67 ("mpam: Add support for group rmid modify")
Signed-off-by: Wang ShaoBo <bobo.shaobowang@huawei.com>
Reviewed-by: Xiongfeng Wang <wangxiongfeng2@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 arch/arm64/kernel/mpam/mpam_resctrl.c | 3 +++
 1 file changed, 3 insertions(+)
diff --git a/arch/arm64/kernel/mpam/mpam_resctrl.c b/arch/arm64/kernel/mpam/mpam_resctrl.c
index a9b99a0f347f..27b59c0fbf5a 100644
--- a/arch/arm64/kernel/mpam/mpam_resctrl.c
+++ b/arch/arm64/kernel/mpam/mpam_resctrl.c
@@ -1848,12 +1848,14 @@ static ssize_t resctrl_group_rmid_write(struct kernfs_open_file *of,
 
 	if (rmid == 0 || rdtgrp->mon.rmid == 0) {
 		ret = -EINVAL;
+		rdt_last_cmd_puts("default rmid 0 is always kept\n");
 		goto unlock;
 	}
 
 	ret = rmid_to_partid_pmg(rmid, &partid, &pmg);
 	if (ret < 0) {
 		ret = -EINVAL;
+		rdt_last_cmd_puts("invalid rmid\n");
 		goto unlock;
 	}
 
@@ -1862,6 +1864,7 @@ static ssize_t resctrl_group_rmid_write(struct kernfs_open_file *of,
 
 	if (rdtgrp->type != RDTCTRL_GROUP ||
 	    !list_empty(&rdtgrp->mon.crdtgrp_list)) {
+		ret = -EINVAL;
 		rdt_last_cmd_puts("unsupported operation\n");
 		goto unlock;
 	}
From: Wang ShaoBo <bobo.shaobowang@huawei.com>

hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I4LL14
CVE: NA

-------------------------------------------------
Different from Intel RDT, MPAM needs to handle more cases when monitoring: two labels, PARTID and PMG, are embedded in a single data stream. They may work at the same time, or only PMG may work; if only PMG works, the number of PMGs determines how many resources can be monitored at the same time.
For instance (NR_PARTID = 2, NR_PMG = 2):

(1) PARTID and PMG work together:

    RMID = PARTID + PMG*NR_PARTID

    RMID   PARTID   PMG
      0      0       0
      1      1       0
      2      0       1
      3      1       1

(2) only PMG works:

    RMID = PARTID + PMG*NR_PARTID

    RMID   PARTID   PMG
      0      0       0
      0      1       0    (PARTID=1 makes no sense)
      1      0       1
      1      1       1    (PARTID=1 makes no sense)
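The mapping itself is one line of arithmetic; an illustrative helper (the example_ name is made up here, the patch's real to_rmid()/rmid_to_partid_pmg() add validation on top of this):

	/* RMID = PARTID + PMG * NR_PARTID, e.g. example_to_rmid(1, 1, 2) == 3 */
	static inline int example_to_rmid(int partid, int pmg, int nr_partid)
	{
		return partid + pmg * nr_partid;
	}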
Given those reasons, we must take care with how the rmid remap matrix is used. Two fields are added to struct rmid_transform for measuring allocation and release of monitor resources (RMIDs):

    @step_size: step size when traversing a point of the matrix once
    @step_cnt:  how many times to traverse (e.g. step_cnt=2 if cdp is enabled)
step_size defaults to 1; if only PMG (NR_PMG=4) works, it is set equal to the number of columns. step_cnt means how many RMIDs are allocated and released each time. In that case the rmid remap matrix looks like:

         ^
         |
    ------column------>

    RMID   0   1   2   3   (step_size=1)
           `---'
             `--> (step_cnt=2 if cdp enabled)

    RMID   0   1   2   3   (step_size=1)
           `--
             `--> (step_cnt=1 if cdp disabled)
If PARTID (NR_PARTID=4) and PMG (NR_PMG=4) work together, the rmid remap matrix looks like:

    ------------row------------>
    |
    |  RMID   0   1   2   3   (step_size=1)
    |         `---'
    |           `--> (step_cnt=2 if cdp enabled)
    |         4   5   6   7
    |         8   9  10  11
    v        12  13  14  15
In addition, step_size values other than 1 (cross-line traversal) are also supported, although this scenario has not arisen in practice.
Signed-off-by: Wang ShaoBo <bobo.shaobowang@huawei.com>
Reviewed-by: Xiongfeng Wang <wangxiongfeng2@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 arch/arm64/kernel/mpam/mpam_resctrl.c | 283 ++++++++++++++++----------
 1 file changed, 178 insertions(+), 105 deletions(-)
diff --git a/arch/arm64/kernel/mpam/mpam_resctrl.c b/arch/arm64/kernel/mpam/mpam_resctrl.c
index 27b59c0fbf5a..0bfcd0b6a032 100644
--- a/arch/arm64/kernel/mpam/mpam_resctrl.c
+++ b/arch/arm64/kernel/mpam/mpam_resctrl.c
@@ -632,23 +632,24 @@ int closid_bitmap_init(void)
  * @rows: Number of bits for remap_body[:] bitmap
  * @clos: Number of bitmaps
  * @nr_usage: Number rmid we have
- * @stride: Step stride from transforming rmid to partid and pmg
+ * @step_size: Step size from traversing the point of matrix once
+ * @step_cnt: Indicates how many times to traverse(.e.g if cdp;step_cnt=2)
  * @remap_body: Storing bitmaps' entry and itself
- * @remap_enabled: Does remap_body init done
  */
 struct rmid_transform {
 	u32 rows;
 	u32 cols;
 	u32 nr_usage;
-	int stride;
+	int step_size;
+	int step_cnt;
 	unsigned long **remap_body;
-	bool remap_enabled;
 };
 static struct rmid_transform rmid_remap_matrix;
+DEFINE_STATIC_KEY_FALSE(rmid_remap_enable_key);
 
 static u32 get_nr_rmids(void)
 {
-	if (!rmid_remap_matrix.remap_enabled)
+	if (!static_branch_likely(&rmid_remap_enable_key))
 		return 0;
 
 	return rmid_remap_matrix.nr_usage;
@@ -687,9 +688,17 @@ static int set_rmid_remap_matrix(u32 rows, u32 cols)
 	 */
 	hw_alloc_times_validate(times, flag);
 	rmid_remap_matrix.cols = rounddown(cols, times);
-	rmid_remap_matrix.stride = times;
+	rmid_remap_matrix.step_cnt = times;
 	if (times > rmid_remap_matrix.cols)
 		return -EINVAL;
+	/*
+	 * if only pmg(Performance Monitor Group)
+	 * work on the monitor, step_size must be
+	 * set to maximum number of columns,
+	 * otherwise set it to 1, such as kunpeng
+	 * 920 does.
+	 */
+	rmid_remap_matrix.step_size = 1;
 
 	/*
 	 * first row of rmid remap matrix is used for indicating
@@ -733,7 +742,8 @@ static int set_rmid_remap_matrix(u32 rows, u32 cols)
 				0, rmid_remap_matrix.rows);
 	}
 
-	rmid_remap_matrix.remap_enabled = 1;
+	/* make column entry of rmid matrix visible */
+	static_branch_enable_cpuslocked(&rmid_remap_enable_key);
 
 	return 0;
 clean:
@@ -748,6 +758,9 @@ static int set_rmid_remap_matrix(u32 rows, u32 cols)
 		rmid_remap_matrix.remap_body = NULL;
 	}
 
+	/* if recreation failed, cannot use rmid remap matrix */
+	static_branch_disable_cpuslocked(&rmid_remap_enable_key);
+
 	return ret;
 }
 
@@ -761,37 +774,101 @@ static u32 probe_rmid_remap_matrix_rows(void)
 	return (u32)mpam_sysprops_num_pmg();
 }
 
-static inline unsigned long **__rmid_remap_bmp(int col)
+static inline unsigned long **__rmid_remap_bmp(u32 col)
 {
-	if (!rmid_remap_matrix.remap_enabled)
+	if (!static_branch_likely(&rmid_remap_enable_key))
 		return NULL;
 
-	if ((u32)col >= rmid_remap_matrix.cols)
+	if (col >= rmid_remap_matrix.cols)
 		return NULL;
 
 	return rmid_remap_matrix.remap_body + col;
 }
 
-#define for_each_rmid_remap_bmp(bmp)	\
-	for (bmp = __rmid_remap_bmp(0);	\
-	     bmp <= __rmid_remap_bmp(rmid_remap_matrix.cols - 1); \
-	     bmp++)
-
-#define for_each_valid_rmid_remap_bmp(bmp)	\
-	for_each_rmid_remap_bmp(bmp)	\
-		if (bmp && *bmp)
-
-#define STRIDE_CHK(stride)	\
-	(stride == rmid_remap_matrix.stride)
-
-#define STRIDE_INC_CHK(stride)	\
-	(++stride == rmid_remap_matrix.stride)
+/*
+ * these macros defines how can we traverse rmid remap matrix, there are
+ * three scenarios:
+ *
+ * (1) step_size is default set to 1, if only PMG(NR_PMG=4) works, makes
+ *     it equals to number of columns, step_cnt means how many times are
+ *     allocated and released each time, at this time rmid remap matrix
+ *     looks like:
+ *
+ *          ^
+ *          |
+ *     ------column------>
+ *
+ *     RMID   0   1   2   3   (step_size=1)
+ *            `---'
+ *              `--> (step_cnt=2 if cdp enabled)
+ *
+ *     RMID   0   1   2   3   (step_size=1)
+ *            `--
+ *              `--> (step_cnt=1 if cdp disabled)
+ *
+ * (2) if PARTID(NR_PARTID=4) and PMG(NR_PMG=4) works together, at this
+ *     time rmid remap matrix looks like:
+ *
+ *     ------------row------------>
+ *     |
+ *     |  RMID   0   1   2   3   (step_size=1)
+ *     |         `---'
+ *     |           `--> (step_cnt=2 if cdp enabled)
+ *     |         4   5   6   7
+ *     |         8   9  10  11
+ *     v        12  13  14  15
+ *
+ * (3) step_size not equal to 1, cross-line traversal, but this scenario
+ *     did not happen yet.
+ */
 
-#define STRIDE_CHK_AND_WARN(stride)	\
-do {	\
-	if (!STRIDE_CHK(stride))	\
-		WARN_ON_ONCE("Unexpected stride\n");	\
-} while (0)
+#define __xy_initialize(x, y, from)	\
+	(x = from, y = 0)
+#define __xy_overflow(x, y)	\
+	(y >= rmid_remap_matrix.cols)
+#define __x_forward(x)	\
+	(x = (x + 1) % rmid_remap_matrix.cols)
+#define __y_forward(x, y)	\
+	(y += ((x) ? 0 : 1))
+
+#define __step_xy_initialize(step, x, y, from)	\
+	(x = from, step = 1, y = 0)
+#define __step_align(from)	\
+	(!(from % rmid_remap_matrix.step_size))
+#define __step_overflow(step)	\
+	(__xy_overflow(x, y) ||	\
+	 (step > rmid_remap_matrix.step_cnt))
+#define __step_x_forward(x)	\
+	__x_forward(x)
+#define __step_forward(step, x)	\
+	(step += ((x % rmid_remap_matrix.step_size) ? 0 : 1))
+#define __step_y_forward(x, y)	\
+	__y_forward(x, y)
+
+#define for_each_rmid_transform_point_step_from(p_entry, step, x, y, from)	\
+	for (__step_xy_initialize(step, x, y, from),	\
+	     (p_entry) = __rmid_remap_bmp((from));	\
+	     __step_align(from) && !__step_overflow(step);	\
+	     __step_x_forward(x),	\
+	     __step_forward(step, x),	\
+	     __step_y_forward(x, y),	\
+	     (p_entry) = __rmid_remap_bmp(x))	\
+		if (unlikely(((p_entry) == NULL) ||	\
+			     (*p_entry) == NULL))	\
+			WARN_ON_ONCE(1);	\
+		else
+
+#define for_each_rmid_transform_point_from(p_entry, x, y, from)	\
+	for (__xy_initialize(x, y, from),	\
+	     (p_entry) = __rmid_remap_bmp((from));	\
+	     !__xy_overflow(x, y);	\
+	     __x_forward(x),	\
+	     __y_forward(x, y),	\
+	     (p_entry) = __rmid_remap_bmp(x))	\
+		if (unlikely(((p_entry) == NULL) ||	\
+			     (*p_entry) == NULL))	\
+			WARN_ON_ONCE(1);	\
+		else
 
 static void set_rmid_remap_bmp_occ(unsigned long *bmp)
 {
@@ -831,6 +908,32 @@ static int is_rmid_remap_bmp_full(unsigned long *bmp)
 		bitmap_full(bmp, rmid_remap_matrix.rows));
 }
 
+static int rmid_remap_bmp_find_first_avail_partid(int partid)
+{
+	int x, y;
+	unsigned long **bmp;
+
+	if (rmid_remap_matrix.step_size ==
+	    rmid_remap_matrix.cols)
+		return 0;
+
+	bmp = __rmid_remap_bmp(partid);
+	if (bmp && !is_rmid_remap_bmp_occ(*bmp))
+		return partid;
+
+	for_each_rmid_transform_point_from(bmp, x, y, 0) {
+		/*
+		 * do not waste partid resource, start
+		 * from step_size aligned position.
+		 */
+		if (!is_rmid_remap_bmp_occ(*bmp) &&
+		    (x % rmid_remap_matrix.step_size) == 0)
+			return x;
+	}
+
+	return -ENOSPC;
+}
+
 static int rmid_remap_bmp_alloc_pmg(unsigned long *bmp)
 {
 	int pos;
@@ -845,8 +948,7 @@ static int rmid_remap_bmp_alloc_pmg(unsigned long *bmp)
 
 static int rmid_remap_matrix_init(void)
 {
-	int stride = 0;
-	int ret;
+	int x, y, step, ret;
 	u32 cols, rows;
 	unsigned long **bmp;
 
@@ -863,15 +965,11 @@ static int rmid_remap_matrix_init(void)
 	 * default rmid, otherwise drop partid = 0 and
 	 * partid = 1 for LxCACHE, LxDATA reservation.
 	 */
-	for_each_valid_rmid_remap_bmp(bmp) {
+	for_each_rmid_transform_point_step_from(bmp, step, x, y, 0) {
 		set_rmid_remap_bmp_occ(*bmp);
-		rmid_remap_bmp_bdr_clear(*bmp, 0);
-		if (STRIDE_INC_CHK(stride))
-			break;
+		rmid_remap_bmp_alloc_pmg(*bmp);
 	}
 
-	STRIDE_CHK_AND_WARN(stride);
-
 	ret = rmid_mon_ptrs_init(rmid_remap_matrix.nr_usage);
 	if (ret)
 		goto out;
@@ -916,70 +1014,59 @@ static int rmid_to_partid_pmg(int rmid, int *partid, int *pmg)
 
 static int __rmid_alloc(int partid, int pmg)
 {
-	int stride = 0;
-	int partid_sel = 0;
-	int ret;
-	int rmid[2] = {-1, -1};
-	unsigned long **cmp, **bmp;
+	int x, y, step, ret, rmid;
+	bool checkpmg = false;
+	unsigned long **bmp;
 
-	if (partid >= 0) {
-		cmp = __rmid_remap_bmp(partid);
-		if (!cmp) {
-			ret = -EINVAL;
-			goto out;
-		}
-		for_each_valid_rmid_remap_bmp(bmp) {
-			if (bmp < cmp)
-				continue;
-			set_rmid_remap_bmp_occ(*bmp);
-
-			if (pmg >= 0) {
-				if (is_rmid_remap_bmp_bdr_set(*bmp, pmg)) {
-					ret = -EEXIST;
-					goto out;
-				}
-				rmid_remap_bmp_bdr_clear(*bmp, pmg);
-			} else {
-				ret = rmid_remap_bmp_alloc_pmg(*bmp);
-				if (ret < 0)
-					goto out;
-				pmg = ret;
-			}
+	if (pmg >= 0)
+		checkpmg = true;
 
-			rmid[stride] = to_rmid(partid + stride, pmg);
-			if (STRIDE_INC_CHK(stride))
-				break;
-		}
-	} else {
-		for_each_valid_rmid_remap_bmp(bmp) {
-			partid_sel++;
+	/* traverse from first non-occupied and step_size aligned entry */
+	ret = rmid_remap_bmp_find_first_avail_partid(partid);
+	if (ret < 0)
+		goto out;
+	partid = ret;
 
-			if (is_rmid_remap_bmp_occ(*bmp))
-				continue;
-			set_rmid_remap_bmp_occ(*bmp);
+	for_each_rmid_transform_point_step_from(bmp, step, x, y, partid) {
+		set_rmid_remap_bmp_occ(*bmp);
 
-			ret = rmid_remap_bmp_alloc_pmg(*bmp);
-			if (ret < 0)
+		/* checking if the given pmg is available */
+		if (checkpmg) {
+			/*
+			 * it can only happened in step_size aligned
+			 * position, so it does not exist pmgs cleared
+			 * before.
+			 */
+			if (is_rmid_remap_bmp_bdr_set(*bmp, pmg + y)) {
+				ret = -EEXIST;
 				goto out;
-			pmg = ret;
-			rmid[stride] = to_rmid(partid_sel - 1, pmg);
-			if (STRIDE_INC_CHK(stride))
-				break;
+			}
+			rmid_remap_bmp_bdr_clear(*bmp, pmg + y);
+			continue;
 		}
+
+		/* alloc available pmg */
+		ret = rmid_remap_bmp_alloc_pmg(*bmp);
+		if (ret < 0)
+			goto out;
+		/* always return first pmg */
+		if (pmg < 0)
+			pmg = ret;
 	}
 
-	if (!STRIDE_CHK(stride)) {
+	rmid = to_rmid(partid, pmg);
+	if (!is_rmid_valid(rmid)) {
 		ret = -ENOSPC;
 		goto out;
 	}
-
-	ret = assoc_rmid_with_mon(rmid[0]);
-	if (ret)
+	ret = assoc_rmid_with_mon(rmid);
+	if (ret) {
+		rmid_free(rmid);
 		goto out;
+	}
 
-	return rmid[0];
+	return rmid;
 out:
-	rmid_free(rmid[0]);
 	return ret;
 }
 
@@ -990,32 +1077,18 @@ int rmid_alloc(int partid)
 
 void rmid_free(int rmid)
 {
-	int stride = 0;
-	int partid, pmg;
-	unsigned long **bmp, **cmp;
+	int x, y, step, partid, pmg;
+	unsigned long **bmp;
 
 	if (rmid_to_partid_pmg(rmid, &partid, &pmg))
 		return;
 
-	cmp = __rmid_remap_bmp(partid);
-	if (!cmp)
-		return;
-
-	for_each_valid_rmid_remap_bmp(bmp) {
-		if (bmp < cmp)
-			continue;
-
-		rmid_remap_bmp_bdr_set(*bmp, pmg);
-
+	for_each_rmid_transform_point_step_from(bmp, step, x, y, partid) {
+		rmid_remap_bmp_bdr_set(*bmp, pmg + y);
 		if (is_rmid_remap_bmp_full(*bmp))
 			unset_rmid_remap_bmp_occ(*bmp);
-
-		if (STRIDE_INC_CHK(stride))
-			break;
 	}
 
-	STRIDE_CHK_AND_WARN(stride);
-
 	deassoc_rmid_with_mon(rmid);
 }
From: Li Zefan <lizefan@huawei.com>

euler inclusion
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I4OPKC
CVE: NA

-------------------------------------------------
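Make pid_max a per pid-namespace limit: move pid_max into struct pid_namespace (inherited from the parent namespace on creation), replace the global kernel.pid_max sysctl entry with a handler that reads and writes the calling task's active namespace, and convert the remaining global users (the ns_last_pid bounds and tracing's PID filtering) to init_pid_ns.pid_max.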
Signed-off-by: Pavel Emelyanov <xemul@parallels.com>
Signed-off-by: Li Zefan <lizefan@huawei.com>
Signed-off-by: luojiajun <luojiajun3@huawei.com>
Reviewed-by: Li Zefan <lizefan@huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
Signed-off-by: Cui GaoSheng <cuigaosheng1@huawei.com>
Reviewed-by: weiyang wang <wangweiyang2@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 include/linux/pid.h           |  3 ---
 include/linux/pid_namespace.h |  1 +
 kernel/pid.c                  | 45 ++++++++++++++++++++++++++++++-----
 kernel/pid_namespace.c        |  6 +++--
 kernel/sysctl.c               |  9 -------
 kernel/trace/trace.c          |  4 ++--
 kernel/trace/trace.h          |  2 --
 7 files changed, 46 insertions(+), 24 deletions(-)
diff --git a/include/linux/pid.h b/include/linux/pid.h
index fa10acb8d6a4..34afff2dc888 100644
--- a/include/linux/pid.h
+++ b/include/linux/pid.h
@@ -110,9 +110,6 @@ extern void transfer_pid(struct task_struct *old, struct task_struct *new,
 struct pid_namespace;
 extern struct pid_namespace init_pid_ns;
 
-extern int pid_max;
-extern int pid_max_min, pid_max_max;
-
 /*
  * look up a PID in the hash table. Must be called with the tasklist_lock
  * or rcu_read_lock() held.
diff --git a/include/linux/pid_namespace.h b/include/linux/pid_namespace.h
index 5a5cb45ac57e..b202733aa6cc 100644
--- a/include/linux/pid_namespace.h
+++ b/include/linux/pid_namespace.h
@@ -31,6 +31,7 @@ struct pid_namespace {
 #endif
 	struct user_namespace *user_ns;
 	struct ucounts *ucounts;
+	int pid_max;
 	int reboot;	/* group exit code if this pidns was rebooted */
 	struct ns_common ns;
 } __randomize_layout;
diff --git a/kernel/pid.c b/kernel/pid.c
index be2ec1d26896..28fdf3dc1005 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -43,6 +43,7 @@
 #include <linux/sched/task.h>
 #include <linux/idr.h>
 #include <net/sock.h>
+#include <linux/kmemleak.h>
 #include <uapi/linux/pidfd.h>
 #ifdef CONFIG_PID_RESERVE
 #include <linux/pin_mem.h>
@@ -62,12 +63,10 @@ struct pid init_struct_pid = {
 	}, }
 };
 
-int pid_max = PID_MAX_DEFAULT;
-
 #define RESERVED_PIDS		300
 
-int pid_max_min = RESERVED_PIDS + 1;
-int pid_max_max = PID_MAX_LIMIT;
+static int pid_max_min = RESERVED_PIDS + 1;
+static int pid_max_max = PID_MAX_LIMIT;
 
 /*
  * PID-map pages start out as NULL, they get allocated upon
@@ -83,6 +82,7 @@ struct pid_namespace init_pid_ns = {
 	.child_reaper = &init_task,
 	.user_ns = &init_user_ns,
 	.ns.inum = PROC_PID_INIT_INO,
+	.pid_max = PID_MAX_DEFAULT,
 #ifdef CONFIG_PID_NS
 	.ns.ops = &pidns_operations,
 #endif
@@ -194,7 +194,7 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid,
 			tid = set_tid[ns->level - i];
 
 			retval = -EINVAL;
-			if (tid < 1 || tid >= pid_max)
+			if (tid < 1 || tid >= task_active_pid_ns(current)->pid_max)
 				goto out_free;
 			/*
 			 * Also fail if a PID != 1 is requested and
@@ -237,7 +237,7 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid,
 			 * a partially initialized PID (see below).
 			 */
 			nr = idr_alloc_cyclic(&tmp->idr, NULL, pid_min,
-					      pid_max, GFP_ATOMIC);
+					      tmp->pid_max, GFP_ATOMIC);
 		}
 		spin_unlock_irq(&pidmap_lock);
 		idr_preload_end();
@@ -612,8 +612,37 @@ SYSCALL_DEFINE2(pidfd_open, pid_t, pid, unsigned int, flags)
 	return fd;
 }
 
+static int proc_dointvec_pidmax(struct ctl_table *table, int write,
+		void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+	struct ctl_table tmp;
+
+	tmp = *table;
+	tmp.data = &task_active_pid_ns(current)->pid_max;
+
+	return proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
+}
+
+static struct ctl_table pid_ctl_table[] = {
+	{
+		.procname = "pid_max",
+		.data = &init_pid_ns.pid_max,
+		.maxlen = sizeof(int),
+		.mode = 0644,
+		.proc_handler = proc_dointvec_pidmax,
+		.extra1 = &pid_max_min,
+		.extra2 = &pid_max_max,
+	},
+	{}
+};
+
+static struct ctl_path pid_kern_path[] = { { .procname = "kernel" }, {} };
+
 void __init pid_idr_init(void)
 {
+	struct ctl_table_header *hdr;
+	int pid_max = init_pid_ns.pid_max;
+
 	/* Verify no one has done anything silly: */
 	BUILD_BUG_ON(PID_MAX_LIMIT >= PIDNS_ADDING);
 
@@ -624,6 +653,8 @@ void __init pid_idr_init(void)
 				PIDS_PER_CPU_MIN * num_possible_cpus());
 	pr_info("pid_max: default: %u minimum: %u\n", pid_max, pid_max_min);
 
+	init_pid_ns.pid_max = pid_max;
+
 	idr_init(&init_pid_ns.idr);
 
 	init_pid_ns.pid_cachep = KMEM_CACHE(pid,
@@ -632,6 +663,8 @@ void __init pid_idr_init(void)
 	if (is_need_reserve_pids())
 		reserve_pids(&init_pid_ns.idr, pid_max);
 #endif
+	hdr = register_sysctl_paths(pid_kern_path, pid_ctl_table);
+	kmemleak_not_leak(hdr);
 }
 
 static struct file *__pidfd_fget(struct task_struct *task, int fd)
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index 52c017feabcb..c290d21b6c24 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -109,6 +109,7 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns
 	ns->user_ns = get_user_ns(user_ns);
 	ns->ucounts = ucounts;
 	ns->pid_allocated = PIDNS_ADDING;
+	ns->pid_max = parent_pid_ns->pid_max;
 
 	return ns;
 
@@ -282,6 +283,8 @@ static int pid_ns_ctl_handler(struct ctl_table *table, int write,
 	next = idr_get_cursor(&pid_ns->idr) - 1;
 
 	tmp.data = &next;
+	tmp.extra2 = &pid_ns->pid_max;
+
 	ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
 	if (!ret && write)
 		idr_set_cursor(&pid_ns->idr, next + 1);
@@ -289,7 +292,6 @@ static int pid_ns_ctl_handler(struct ctl_table *table, int write,
 	return ret;
 }
 
-extern int pid_max;
 static struct ctl_table pid_ns_ctl_table[] = {
 	{
 		.procname = "ns_last_pid",
@@ -297,7 +299,7 @@ static struct ctl_table pid_ns_ctl_table[] = {
 		.mode = 0666, /* permissions are checked in the handler */
 		.proc_handler = pid_ns_ctl_handler,
 		.extra1 = SYSCTL_ZERO,
-		.extra2 = &pid_max,
+		.extra2 = &init_pid_ns.pid_max,
 	},
 	{ }
 };
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 749ef59224e2..6eb2b1e88f0c 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -2231,15 +2231,6 @@ static struct ctl_table kern_table[] = {
 		.extra2		= SYSCTL_ONE,
 	},
 #endif /* CONFIG_SMP */
-	{
-		.procname	= "pid_max",
-		.data		= &pid_max,
-		.maxlen		= sizeof (int),
-		.mode		= 0644,
-		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= &pid_max_min,
-		.extra2		= &pid_max_max,
-	},
 	{
 		.procname	= "panic_on_oops",
 		.data		= &panic_on_oops,
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index e4f154119e52..2ce366687ce4 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -693,7 +693,7 @@ int trace_pid_write(struct trace_pid_list *filtered_pids,
 		return -ENOMEM;
 	}
 
-	pid_list->pid_max = READ_ONCE(pid_max);
+	pid_list->pid_max = READ_ONCE(init_pid_ns.pid_max);
 
 	/* Only truncating will shrink pid_max */
 	if (filtered_pids && filtered_pids->pid_max > pid_list->pid_max)
@@ -4896,7 +4896,7 @@ int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled)
 
 	if (mask == TRACE_ITER_RECORD_TGID) {
 		if (!tgid_map) {
-			tgid_map_max = pid_max;
+			tgid_map_max = init_pid_ns.pid_max;
 			map = kvcalloc(tgid_map_max + 1, sizeof(*tgid_map),
 				       GFP_KERNEL);
 
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 15a811d34cd8..94f8087a3c22 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -806,8 +806,6 @@ extern unsigned long tracing_thresh;
 
 /* PID filtering */
 
-extern int pid_max;
-
 bool trace_find_filtered_pid(struct trace_pid_list *filtered_pids,
 			     pid_t search_pid);
 bool trace_ignore_this_task(struct trace_pid_list *filtered_pids,
From: Kefeng Wang <wangkefeng.wang@huawei.com>

maillist inclusion
category: Feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I4NDAW
CVE: NA

Reference: https://lore.kernel.org/lkml/20211226083912.166512-4-wangkefeng.wang@huawei....

-------------------
Add HUGE_VMALLOC_DEFAULT_ENABLED to let the user choose whether huge vmalloc mappings are enabled by default. This lets more architectures enable the huge vmalloc mappings feature without turning it on by default.

Add a hugevmalloc=on/off parameter to enable or disable this feature at boot time; nohugevmalloc is still supported and is equivalent to hugevmalloc=off.
Signed-off-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Signed-off-by: Wang Wensheng <wangwensheng4@huawei.com>
Reviewed-by: Weilong Chen <chenweilong@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 Documentation/admin-guide/kernel-parameters.txt | 11 +++++++++++
 arch/powerpc/Kconfig                            |  1 +
 mm/Kconfig                                      |  7 +++++++
 mm/vmalloc.c                                    | 18 +++++++++++++++++-
 4 files changed, 36 insertions(+), 1 deletion(-)
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 609cd9d4ca89..a84f62b61dbc 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -1587,6 +1587,17 @@
 			on: enable the feature
 			off: disable the feature
 
+	hugevmalloc	[PPC] Requires CONFIG_HAVE_ARCH_HUGE_VMALLOC
+			Format: { on | off }
+			Default set by CONFIG_HUGE_VMALLOC_DEFAULT_ENABLED.
+
+			This parameter enables/disables kernel huge vmalloc
+			mappings at boot time.
+
+			on: Enable the feature
+			off: Disable the feature
+			Equivalent to: nohugevmalloc
+
 	hung_task_panic=
 			[KNL] Should the hung task detector generate panics.
 			Format: 0 | 1
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index da2b1c3b9ae4..4e6f30473a56 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -180,6 +180,7 @@ config PPC
 	select HAVE_ARCH_AUDITSYSCALL
 	select HAVE_ARCH_HUGE_VMAP		if PPC_BOOK3S_64 && PPC_RADIX_MMU
 	select HAVE_ARCH_HUGE_VMALLOC		if HAVE_ARCH_HUGE_VMAP
+	select HUGE_VMALLOC_DEFAULT_ENABLED	if HAVE_ARCH_HUGE_VMALLOC
 	select HAVE_ARCH_JUMP_LABEL
 	select HAVE_ARCH_KASAN			if PPC32 && PPC_PAGE_SHIFT <= 14
 	select HAVE_ARCH_KASAN_VMALLOC		if PPC32 && PPC_PAGE_SHIFT <= 14
diff --git a/mm/Kconfig b/mm/Kconfig
index 8207683afaf2..1ba477dee3ae 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -284,6 +284,13 @@ config ARCH_ENABLE_HUGEPAGE_MIGRATION
 config ARCH_ENABLE_THP_MIGRATION
 	bool
 
+config HUGE_VMALLOC_DEFAULT_ENABLED
+	bool "Enable huge vmalloc mappings by default"
+	depends on HAVE_ARCH_HUGE_VMALLOC
+	help
+	  Enable huge vmalloc mappings by default, this value could be
+	  overridden by hugevmalloc=off|on.
+
 config CONTIG_ALLOC
 	def_bool (MEMORY_ISOLATION && COMPACTION) || CMA
 
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 1b4838f6454d..aa46f5028d17 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -43,7 +43,7 @@
 #include "pgalloc-track.h"
 
 #ifdef CONFIG_HAVE_ARCH_HUGE_VMALLOC
-static bool __ro_after_init vmap_allow_huge = true;
+static bool __ro_after_init vmap_allow_huge = IS_ENABLED(CONFIG_HUGE_VMALLOC_DEFAULT_ENABLED);
 
 static int __init set_nohugevmalloc(char *str)
 {
@@ -51,6 +51,22 @@ static int __init set_nohugevmalloc(char *str)
 	return 0;
 }
 early_param("nohugevmalloc", set_nohugevmalloc);
+
+static int __init set_hugevmalloc(char *str)
+{
+	if (!str)
+		return -EINVAL;
+
+	if (!strcmp(str, "on"))
+		vmap_allow_huge = true;
+	else if (!strcmp(str, "off"))
+		vmap_allow_huge = false;
+	else
+		return -EINVAL;
+
+	return 0;
+}
+early_param("hugevmalloc", set_hugevmalloc);
 #else /* CONFIG_HAVE_ARCH_HUGE_VMALLOC */
 static const bool vmap_allow_huge = false;
 #endif /* CONFIG_HAVE_ARCH_HUGE_VMALLOC */
From: Kefeng Wang <wangkefeng.wang@huawei.com>

maillist inclusion
category: Feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I4NDAW
CVE: NA

Reference: https://lore.kernel.org/lkml/20211226083912.166512-4-wangkefeng.wang@huawei....

-------------------
Select HAVE_ARCH_HUGE_VMALLOC to let arm64 support huge vmalloc mappings. The feature is disabled by default; use hugevmalloc=on to enable it in the scenarios that want it. module_alloc() passes VM_NO_HUGE_VMAP so module mappings stay at base pages.
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Signed-off-by: Wang Wensheng <wangwensheng4@huawei.com>
Reviewed-by: Weilong Chen <chenweilong@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 Documentation/admin-guide/kernel-parameters.txt | 4 ++--
 arch/arm64/Kconfig                              | 1 +
 arch/arm64/kernel/module.c                      | 4 ++--
 3 files changed, 5 insertions(+), 4 deletions(-)
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index a84f62b61dbc..5eaa1b8a6c6e 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -1587,7 +1587,7 @@
 			on: enable the feature
 			off: disable the feature
 
-	hugevmalloc	[PPC] Requires CONFIG_HAVE_ARCH_HUGE_VMALLOC
+	hugevmalloc	[KNL,PPC,ARM64] Requires CONFIG_HAVE_ARCH_HUGE_VMALLOC
 			Format: { on | off }
 			Default set by CONFIG_HUGE_VMALLOC_DEFAULT_ENABLED.
 
@@ -3312,7 +3312,7 @@
 
 	nohugeiomap	[KNL,X86,PPC,ARM64] Disable kernel huge I/O mappings.
 
-	nohugevmalloc	[PPC] Disable kernel huge vmalloc mappings.
+	nohugevmalloc	[KNL,PPC,ARM64] Disable kernel huge vmalloc mappings.
 
 	nosmt		[KNL,S390] Disable symmetric multithreading (SMT).
 			Equivalent to smt=1.
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 44f1bf1a5b08..df6176b55fc8 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -133,6 +133,7 @@ config ARM64
 	select HAVE_ARCH_AUDITSYSCALL
 	select HAVE_ARCH_BITREVERSE
 	select HAVE_ARCH_COMPILER_H
+	select HAVE_ARCH_HUGE_VMALLOC
 	select HAVE_ARCH_HUGE_VMAP
 	select HAVE_ARCH_JUMP_LABEL
 	select HAVE_ARCH_JUMP_LABEL_RELATIVE
diff --git a/arch/arm64/kernel/module.c b/arch/arm64/kernel/module.c
index 2a1ad95d9b2c..031be3c6a9d5 100644
--- a/arch/arm64/kernel/module.c
+++ b/arch/arm64/kernel/module.c
@@ -35,7 +35,7 @@ void *module_alloc(unsigned long size)
 		module_alloc_end = MODULES_END;
 
 	p = __vmalloc_node_range(size, MODULE_ALIGN, module_alloc_base,
-				module_alloc_end, gfp_mask, PAGE_KERNEL, 0,
+				module_alloc_end, gfp_mask, PAGE_KERNEL, VM_NO_HUGE_VMAP,
 				NUMA_NO_NODE, __builtin_return_address(0));
 
 	if (!p && IS_ENABLED(CONFIG_ARM64_MODULE_PLTS) &&
@@ -51,7 +51,7 @@ void *module_alloc(unsigned long size)
 		 */
 		p = __vmalloc_node_range(size, MODULE_ALIGN, module_alloc_base,
 				module_alloc_base + SZ_2G, GFP_KERNEL,
-				PAGE_KERNEL, 0, NUMA_NO_NODE,
+				PAGE_KERNEL, VM_NO_HUGE_VMAP, NUMA_NO_NODE,
 				__builtin_return_address(0));
 
 	if (p && (kasan_module_alloc(p, size) < 0)) {
From: Kefeng Wang <wangkefeng.wang@huawei.com>

maillist inclusion
category: Feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I4NDAW
CVE: NA

Reference: https://lore.kernel.org/lkml/20211226083912.166512-4-wangkefeng.wang@huawei....

-------------------
Select HAVE_ARCH_HUGE_VMALLOC to let X86_64 and X86_PAE support huge vmalloc mappings. The feature is disabled by default; use hugevmalloc=on to enable it. As on arm64, module_alloc() passes VM_NO_HUGE_VMAP so module mappings stay at base pages.
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Signed-off-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Signed-off-by: Wang Wensheng <wangwensheng4@huawei.com>
Reviewed-by: Weilong Chen <chenweilong@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 Documentation/admin-guide/kernel-parameters.txt | 4 ++--
 arch/x86/Kconfig                                | 1 +
 arch/x86/kernel/module.c                        | 2 +-
 3 files changed, 4 insertions(+), 3 deletions(-)
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 5eaa1b8a6c6e..de8f7d447295 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -1587,7 +1587,7 @@
 			on: enable the feature
 			off: disable the feature
 
-	hugevmalloc	[KNL,PPC,ARM64] Requires CONFIG_HAVE_ARCH_HUGE_VMALLOC
+	hugevmalloc	[KNL,PPC,ARM64,X86] Requires CONFIG_HAVE_ARCH_HUGE_VMALLOC
 			Format: { on | off }
 			Default set by CONFIG_HUGE_VMALLOC_DEFAULT_ENABLED.
 
@@ -3312,7 +3312,7 @@
 
 	nohugeiomap	[KNL,X86,PPC,ARM64] Disable kernel huge I/O mappings.
 
-	nohugevmalloc	[KNL,PPC,ARM64] Disable kernel huge vmalloc mappings.
+	nohugevmalloc	[KNL,PPC,ARM64,X86] Disable kernel huge vmalloc mappings.
 
 	nosmt		[KNL,S390] Disable symmetric multithreading (SMT).
 			Equivalent to smt=1.
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 06174d40f1e6..2a625f0d01c8 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -143,6 +143,7 @@ config X86
 	select HAVE_ACPI_APEI_NMI		if ACPI
 	select HAVE_ALIGNED_STRUCT_PAGE		if SLUB
 	select HAVE_ARCH_AUDITSYSCALL
+	select HAVE_ARCH_HUGE_VMALLOC		if HAVE_ARCH_HUGE_VMAP
 	select HAVE_ARCH_HUGE_VMAP		if X86_64 || X86_PAE
 	select HAVE_ARCH_JUMP_LABEL
 	select HAVE_ARCH_JUMP_LABEL_RELATIVE
diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c
index 5e9a34b5bd74..a1eadaa41a95 100644
--- a/arch/x86/kernel/module.c
+++ b/arch/x86/kernel/module.c
@@ -75,7 +75,7 @@ void *module_alloc(unsigned long size)
 	p = __vmalloc_node_range(size, MODULE_ALIGN,
 				 MODULES_VADDR + get_module_load_offset(),
 				 MODULES_END, GFP_KERNEL,
-				 PAGE_KERNEL, 0, NUMA_NO_NODE,
+				 PAGE_KERNEL, VM_NO_HUGE_VMAP, NUMA_NO_NODE,
 				 __builtin_return_address(0));
 	if (p && (kasan_module_alloc(p, size) < 0)) {
 		vfree(p);
From: Weilong Chen <chenweilong@huawei.com>

ascend inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I4K2U5
CVE: NA

-------------------------------------------------
Support disabling the oom-killer, and report oom events to bbox:

vm.enable_oom_killer:
    0: disable oom killer
    1: enable oom killer (default, compatible with mainline)
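A minimal sketch of how a consumer such as the bbox driver would use the new notifier chain (the bbox_* names are made up for illustration; the registration API is the one added below):

	static int bbox_oom_event(struct notifier_block *nb,
				  unsigned long type, void *data)
	{
		/* type is OOM_TYPE_NOMEM, OOM_TYPE_OVERCOMMIT or OOM_TYPE_CGROUP */
		pr_err("bbox: oom event, type %lu\n", type);
		return NOTIFY_OK;
	}

	static struct notifier_block bbox_oom_nb = {
		.notifier_call = bbox_oom_event,
	};

	/* during driver init */
	register_hisi_oom_notifier(&bbox_oom_nb);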
Signed-off-by: Weilong Chen <chenweilong@huawei.com>
Signed-off-by: Zhang Jian <zhangjian210@huawei.com>
Reviewed-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 arch/arm64/Kconfig  | 12 ++++++++++++
 include/linux/oom.h | 26 ++++++++++++++++++++++++++
 kernel/sysctl.c     | 11 +++++++++++
 mm/memcontrol.c     |  5 ++++-
 mm/oom_kill.c       | 45 +++++++++++++++++++++++++++++++++++++++++++++
 mm/util.c           |  2 ++
 6 files changed, 100 insertions(+), 1 deletion(-)
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index df6176b55fc8..3afabc81551c 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -2010,6 +2010,18 @@ config ASCEND_CLEAN_CDM
 	  cause the kernel crash if the essential structures went wrong. So
 	  move the management structures for hbm nodes to the ddr nodes of
 	  the same partion to reduce the probability of kernel crashes.
+
+config ASCEND_OOM
+	bool "Enable support for disable oom killer"
+	default y
+	help
+	  In some cases we hopes that the oom will not kill the process when it occurs,
+	  be able to notify the black box to report the event, and be able to trigger
+	  the panic to locate the problem.
+	  vm.enable_oom_killer:
+	  0: disable oom killer
+	  1: enable oom killer (default,compatible with mainline)
+
 endif
 
 endmenu
diff --git a/include/linux/oom.h b/include/linux/oom.h
index 2db9a1432511..d57de7b1e75a 100644
--- a/include/linux/oom.h
+++ b/include/linux/oom.h
@@ -127,4 +127,30 @@ extern struct task_struct *find_lock_task_mm(struct task_struct *p);
 extern int sysctl_oom_dump_tasks;
 extern int sysctl_oom_kill_allocating_task;
 extern int sysctl_panic_on_oom;
+extern int sysctl_enable_oom_killer;
+
+#define OOM_TYPE_NOMEM		0
+#define OOM_TYPE_OVERCOMMIT	1
+#define OOM_TYPE_CGROUP		2
+
+#ifdef CONFIG_ASCEND_OOM
+extern int register_hisi_oom_notifier(struct notifier_block *nb);
+extern int unregister_hisi_oom_notifier(struct notifier_block *nb);
+int oom_type_notifier_call(unsigned int type, struct oom_control *oc);
+#else
+static inline int register_hisi_oom_notifier(struct notifier_block *nb)
+{
+	return -EINVAL;
+}
+
+static inline int unregister_hisi_oom_notifier(struct notifier_block *nb)
+{
+	return -EINVAL;
+}
+
+static inline int oom_type_notifier_call(unsigned int type, struct oom_control *oc)
+{
+	return -EINVAL;
+}
+#endif
 #endif /* _INCLUDE_LINUX_OOM_H */
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 6eb2b1e88f0c..ce200213ccbd 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -2761,6 +2761,17 @@ static struct ctl_table vm_table[] = {
 		.extra1		= SYSCTL_ZERO,
 		.extra2		= &two,
 	},
+#ifdef CONFIG_ASCEND_OOM
+	{
+		.procname	= "enable_oom_killer",
+		.data		= &sysctl_enable_oom_killer,
+		.maxlen		= sizeof(sysctl_enable_oom_killer),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= SYSCTL_ONE,
+	},
+#endif
 	{
 		.procname	= "oom_kill_allocating_task",
 		.data		= &sysctl_oom_kill_allocating_task,
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index b5ba2dd2d468..99cfd840e3bd 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1895,6 +1895,7 @@ static enum oom_status mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int
 		current->memcg_in_oom = memcg;
 		current->memcg_oom_gfp_mask = mask;
 		current->memcg_oom_order = order;
+		oom_type_notifier_call(OOM_TYPE_CGROUP, NULL);
 
 		return OOM_ASYNC;
 	}
@@ -1962,12 +1963,14 @@ bool mem_cgroup_oom_synchronize(bool handle)
 	if (locked)
 		mem_cgroup_oom_notify(memcg);
 
-	if (locked && !memcg->oom_kill_disable) {
+	if (locked && !memcg->oom_kill_disable && sysctl_enable_oom_killer) {
 		mem_cgroup_unmark_under_oom(memcg);
 		finish_wait(&memcg_oom_waitq, &owait.wait);
 		mem_cgroup_out_of_memory(memcg, current->memcg_oom_gfp_mask,
 					 current->memcg_oom_order);
 	} else {
+		oom_type_notifier_call(OOM_TYPE_CGROUP, NULL);
+
 		schedule();
 		mem_cgroup_unmark_under_oom(memcg);
 		finish_wait(&memcg_oom_waitq, &owait.wait);
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 57300369efe3..ffbe8fe2bbf6 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -54,6 +54,7 @@
 int sysctl_panic_on_oom;
 int sysctl_oom_kill_allocating_task;
 int sysctl_oom_dump_tasks = 1;
+int sysctl_enable_oom_killer = 1;
 
 /*
  * Serializes oom killer invocations (out_of_memory()) from all contexts to
@@ -1081,6 +1082,45 @@ int unregister_oom_notifier(struct notifier_block *nb)
 }
 EXPORT_SYMBOL_GPL(unregister_oom_notifier);
 
+#ifdef CONFIG_ASCEND_OOM
+static BLOCKING_NOTIFIER_HEAD(oom_type_notify_list);
+
+int register_hisi_oom_notifier(struct notifier_block *nb)
+{
+	return blocking_notifier_chain_register(&oom_type_notify_list, nb);
+}
+EXPORT_SYMBOL_GPL(register_hisi_oom_notifier);
+
+int unregister_hisi_oom_notifier(struct notifier_block *nb)
+{
+	return blocking_notifier_chain_unregister(&oom_type_notify_list, nb);
+}
+EXPORT_SYMBOL_GPL(unregister_hisi_oom_notifier);
+
+int oom_type_notifier_call(unsigned int type, struct oom_control *oc)
+{
+	struct oom_control oc_tmp = { 0 };
+	static unsigned long caller_jiffies;
+
+	if (sysctl_enable_oom_killer)
+		return -EINVAL;
+
+	if (oc)
+		type = is_memcg_oom(oc) ? OOM_TYPE_CGROUP : OOM_TYPE_NOMEM;
+	else
+		oc = &oc_tmp;
+
+	if (printk_timed_ratelimit(&caller_jiffies, 10000)) {
+		pr_err("OOM_NOTIFIER: oom type %u\n", type);
+		dump_stack();
+		show_mem(SHOW_MEM_FILTER_NODES, NULL);
+		dump_tasks(oc);
+	}
+
+	return blocking_notifier_call_chain(&oom_type_notify_list, type, NULL);
+}
+#endif
+
 /**
  * out_of_memory - kill the "best" process when we run out of memory
  * @oc: pointer to struct oom_control
@@ -1097,6 +1137,11 @@ bool out_of_memory(struct oom_control *oc)
 	if (oom_killer_disabled)
 		return false;
 
+	if (!sysctl_enable_oom_killer) {
+		oom_type_notifier_call(0, oc);
+		return false;
+	}
+
 	if (!is_memcg_oom(oc)) {
 		blocking_notifier_call_chain(&oom_notify_list, 0, &freed);
 		if (freed > 0)
diff --git a/mm/util.c b/mm/util.c
index 47d074912630..2350c064abc6 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -25,6 +25,7 @@
 #include <linux/compat.h>
 
 #include <linux/uaccess.h>
+#include <linux/oom.h>
 
 #include "internal.h"
 
@@ -913,6 +914,7 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
 	if (percpu_counter_read_positive(&vm_committed_as) < allowed)
 		return 0;
 error:
+	oom_type_notifier_call(OOM_TYPE_OVERCOMMIT, NULL);
 	vm_unacct_memory(pages);
 
 	return -ENOMEM;
From: Zhang Jian zhangjian210@huawei.com
ascend inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4K2U5 CVE: NA
-------------------------------------------------
Enable the Ascend OOM control feature in the openeuler_defconfig default config.
Signed-off-by: Zhang Jian zhangjian210@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- arch/arm64/configs/openeuler_defconfig | 1 + 1 file changed, 1 insertion(+)
diff --git a/arch/arm64/configs/openeuler_defconfig b/arch/arm64/configs/openeuler_defconfig index 6cdcbad3967e..fb5b9650b458 100644 --- a/arch/arm64/configs/openeuler_defconfig +++ b/arch/arm64/configs/openeuler_defconfig @@ -476,6 +476,7 @@ CONFIG_RANDOMIZE_MODULE_REGION_FULL=y CONFIG_ASCEND_FEATURES=y CONFIG_ASCEND_DVPP_MMAP=y CONFIG_ASCEND_CLEAN_CDM=y +CONFIG_ASCEND_OOM=y # end of Kernel Features
#
From: Lijun Fang fanglijun3@huawei.com
ascend inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4JMM0 CVE: NA -------------------
Change svm to be built as a module by default. Remove the get-meminfo functions; users can get the meminfo from procfs instead.
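For reference, the task-lookup change in the diff below folds the open-coded RCU sequence into the find_get_task_by_vpid() helper; both forms take a reference on the task before returning it, so the conversion is behavior-preserving:

/* Before: open-coded lookup, reference taken under RCU */
rcu_read_lock();
task = find_task_by_vpid(params.vpid);
if (task)
	get_task_struct(task);
rcu_read_unlock();

/* After: one helper with the same reference semantics */
task = find_get_task_by_vpid(params.vpid);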
Signed-off-by: Lijun Fang fanglijun3@huawei.com Reviewed-by: Weilong Chen chenweilong@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- drivers/char/Kconfig | 2 +- drivers/char/svm.c | 215 +++---------------------------------------- 2 files changed, 16 insertions(+), 201 deletions(-)
diff --git a/drivers/char/Kconfig b/drivers/char/Kconfig index c80a5c641634..f75f9e3ea324 100644 --- a/drivers/char/Kconfig +++ b/drivers/char/Kconfig @@ -479,7 +479,7 @@ config PIN_MEMORY_DEV pin memory driver
config HISI_SVM - bool "Hisilicon svm driver" + tristate "Hisilicon svm driver" depends on ARM64 && ARM_SMMU_V3 && MMU_NOTIFIER default m help diff --git a/drivers/char/svm.c b/drivers/char/svm.c index b85283118417..3b591f197af6 100644 --- a/drivers/char/svm.c +++ b/drivers/char/svm.c @@ -38,9 +38,7 @@
#define SVM_IOCTL_REMAP_PROC 0xfff4 #define SVM_IOCTL_UNPIN_MEMORY 0xfff5 -#define SVM_IOCTL_GETHUGEINFO 0xfff6 #define SVM_IOCTL_PIN_MEMORY 0xfff7 -#define SVM_IOCTL_GET_PHYMEMINFO 0xfff8 #define SVM_IOCTL_GET_PHYS 0xfff9 #define SVM_IOCTL_LOAD_FLAG 0xfffa #define SVM_IOCTL_SET_RC 0xfffc @@ -120,23 +118,6 @@ struct svm_proc_mem { u64 buf; };
-struct meminfo { - unsigned long hugetlbfree; - unsigned long hugetlbtotal; -}; - -struct phymeminfo { - unsigned long normal_total; - unsigned long normal_free; - unsigned long huge_total; - unsigned long huge_free; -}; - -struct phymeminfo_ioctl { - struct phymeminfo *info; - unsigned long nodemask; -}; - static char *svm_cmd_to_string(unsigned int cmd) { switch (cmd) { @@ -150,10 +131,6 @@ static char *svm_cmd_to_string(unsigned int cmd) return "pin memory"; case SVM_IOCTL_UNPIN_MEMORY: return "unpin memory"; - case SVM_IOCTL_GETHUGEINFO: - return "get hugeinfo"; - case SVM_IOCTL_GET_PHYMEMINFO: - return "get physical memory info"; case SVM_IOCTL_REMAP_PROC: return "remap proc"; case SVM_IOCTL_LOAD_FLAG: @@ -853,11 +830,7 @@ static struct task_struct *svm_get_task(struct svm_bind_process params) if (params.flags & SVM_BIND_PID) { struct mm_struct *mm = NULL;
- rcu_read_lock(); - task = find_task_by_vpid(params.vpid); - if (task) - get_task_struct(task); - rcu_read_unlock(); + task = find_get_task_by_vpid(params.vpid); if (task == NULL) return ERR_PTR(-ESRCH);
@@ -967,7 +940,7 @@ static pte_t *svm_get_pte(struct vm_area_struct *vma,
if (is_vm_hugetlb_page(vma)) { if (pud_present(*pud)) { - if (pud_huge(*pud)) { + if (pud_val(*pud) && !(pud_val(*pud) & PUD_TABLE_BIT)) { pte = (pte_t *)pud; *offset = addr & (PUD_SIZE - 1); size = PUD_SIZE; @@ -989,8 +962,6 @@ static pte_t *svm_get_pte(struct vm_area_struct *vma, pte = (pte_t *)pmd; *offset = addr & (PMD_SIZE - 1); size = PMD_SIZE; - } else if (pmd_trans_unstable(pmd)) { - pr_warn("%s: thp unstable\n", __func__); } else { pte = pte_offset_map(pmd, addr); *offset = addr & (PAGE_SIZE - 1); @@ -1019,15 +990,15 @@ static pte_t *svm_walk_pt(unsigned long addr, unsigned long *page_size, return NULL;
pgd = pgd_offset(mm, addr); - if (pgd_none_or_clear_bad(pgd)) + if (pgd_none(*pgd)) return NULL;
p4d = p4d_offset(pgd, addr); - if (p4d_none_or_clear_bad(p4d)) + if (p4d_none(*p4d)) return NULL;
pud = pud_offset(p4d, addr); - if (pud_none_or_clear_bad(pud)) + if (pud_none(*pud)) return NULL;
return svm_get_pte(vma, pud, addr, page_size, offset); @@ -1155,95 +1126,6 @@ static int svm_set_rc(unsigned long __user *arg) return 0; }
-static long svm_get_hugeinfo(unsigned long __user *arg) -{ - struct hstate *h = &default_hstate; - struct meminfo info; - - if (!acpi_disabled) - return -EPERM; - - if (arg == NULL) - return -EINVAL; - - if (!hugepages_supported()) - return -ENOTSUPP; - - info.hugetlbfree = h->free_huge_pages; - info.hugetlbtotal = h->nr_huge_pages; - - if (copy_to_user((void __user *)arg, &info, sizeof(info))) - return -EFAULT; - - pr_info("svm get hugetlb info: order(%u), max_huge_pages(%lu)," - "nr_huge_pages(%lu), free_huge_pages(%lu), resv_huge_pages(%lu)", - h->order, - h->max_huge_pages, - h->nr_huge_pages, - h->free_huge_pages, - h->resv_huge_pages); - - return 0; -} - -static void svm_get_node_memory_info_inc(unsigned long nid, struct phymeminfo *info) -{ - struct sysinfo i; - struct hstate *h = &default_hstate; - unsigned long huge_free = 0; - unsigned long huge_total = 0; - - if (hugepages_supported()) { - huge_free = h->free_huge_pages_node[nid] * (PAGE_SIZE << huge_page_order(h)); - huge_total = h->nr_huge_pages_node[nid] * (PAGE_SIZE << huge_page_order(h)); - } - -#ifdef CONFIG_NUMA - si_meminfo_node(&i, nid); -#else - si_meminfo(&i); -#endif - info->normal_free += i.freeram * PAGE_SIZE; - info->normal_total += i.totalram * PAGE_SIZE - huge_total; - info->huge_total += huge_total; - info->huge_free += huge_free; -} - -static void __svm_get_memory_info(unsigned long nodemask, struct phymeminfo *info) -{ - memset(info, 0x0, sizeof(struct phymeminfo)); - - nodemask = nodemask & ((1UL << MAX_NUMNODES) - 1); - - while (nodemask) { - unsigned long nid = find_first_bit(&nodemask, BITS_PER_LONG); - - if (node_isset(nid, node_online_map)) - (void)svm_get_node_memory_info_inc(nid, info); - - nodemask &= ~(1UL << nid); - } -} - -static long svm_get_phy_memory_info(unsigned long __user *arg) -{ - struct phymeminfo info; - struct phymeminfo_ioctl para; - - if (arg == NULL) - return -EINVAL; - - if (copy_from_user(¶, (void __user *)arg, sizeof(para))) - return -EFAULT; - - __svm_get_memory_info(para.nodemask, &info); - - if (copy_to_user((void __user *)para.info, &info, sizeof(info))) - return -EFAULT; - - return 0; -} - static long svm_remap_get_phys(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, unsigned long *phys, unsigned long *page_size, unsigned long *offset) @@ -1259,15 +1141,15 @@ static long svm_remap_get_phys(struct mm_struct *mm, struct vm_area_struct *vma, return err;
pgd = pgd_offset(mm, addr); - if (pgd_none_or_clear_bad(pgd)) + if (pgd_none(*pgd)) return err;
p4d = p4d_offset(pgd, addr); - if (p4d_none_or_clear_bad(p4d)) + if (p4d_none(*p4d)) return err;
pud = pud_offset(p4d, addr); - if (pud_none_or_clear_bad(pud)) + if (pud_none(*pud)) return err;
pte = svm_get_pte(vma, pud, addr, page_size, offset); @@ -1308,11 +1190,9 @@ static long svm_remap_proc(unsigned long __user *arg) return -EINVAL; }
- rcu_read_lock(); if (pmem.pid) { - ptask = find_task_by_vpid(pmem.pid); + ptask = find_get_task_by_vpid(pmem.pid); if (!ptask) { - rcu_read_unlock(); pr_err("No task for this pid\n"); return -EINVAL; } @@ -1320,8 +1200,6 @@ static long svm_remap_proc(unsigned long __user *arg) ptask = current; }
- get_task_struct(ptask); - rcu_read_unlock(); pmm = ptask->mm;
down_read(&mm->mmap_lock); @@ -1408,65 +1286,6 @@ static int svm_proc_load_flag(int __user *arg) return put_user(flag, arg); }
-static unsigned long svm_get_unmapped_area(struct file *file, - unsigned long addr0, unsigned long len, - unsigned long pgoff, unsigned long flags) -{ - unsigned long addr = addr0; - struct mm_struct *mm = current->mm; - struct vm_unmapped_area_info info; - struct svm_device *sdev = file_to_sdev(file); - - if (!acpi_disabled) - return -EPERM; - - if (flags & MAP_FIXED) { - if (IS_ALIGNED(addr, len)) - return addr; - - dev_err(sdev->dev, "MAP_FIXED but not aligned\n"); - return -EINVAL; //lint !e570 - } - - if (addr) { - struct vm_area_struct *vma = NULL; - - addr = ALIGN(addr, len); - - if (dvpp_mmap_check(addr, len, flags)) - return -ENOMEM; - - vma = find_vma(mm, addr); - if (TASK_SIZE - len >= addr && addr >= mmap_min_addr && - (vma == NULL || addr + len <= vm_start_gap(vma))) - return addr; - } - - info.flags = VM_UNMAPPED_AREA_TOPDOWN; - info.length = len; - info.low_limit = max(PAGE_SIZE, mmap_min_addr); - info.high_limit = ((mm->mmap_base <= DVPP_MMAP_BASE) ? - mm->mmap_base : DVPP_MMAP_BASE); - info.align_mask = ((len >> PAGE_SHIFT) - 1) << PAGE_SHIFT; - info.align_offset = pgoff << PAGE_SHIFT; - - addr = vm_unmapped_area(&info); - - if (offset_in_page(addr)) { - VM_BUG_ON(addr != -ENOMEM); - info.flags = 0; - info.low_limit = TASK_UNMAPPED_BASE; - info.high_limit = DVPP_MMAP_BASE; - - if (enable_mmap_dvpp) - dvpp_mmap_get_area(&info, flags); - - addr = vm_unmapped_area(&info); - } - - return addr; -} - static int svm_mmap(struct file *file, struct vm_area_struct *vma) { int err; @@ -1622,12 +1441,6 @@ static long svm_ioctl(struct file *file, unsigned int cmd, case SVM_IOCTL_UNPIN_MEMORY: err = svm_unpin_memory((unsigned long __user *)arg); break; - case SVM_IOCTL_GETHUGEINFO: - err = svm_get_hugeinfo((unsigned long __user *)arg); - break; - case SVM_IOCTL_GET_PHYMEMINFO: - err = svm_get_phy_memory_info((unsigned long __user *)arg); - break; case SVM_IOCTL_REMAP_PROC: err = svm_remap_proc((unsigned long __user *)arg); break; @@ -1652,7 +1465,6 @@ static const struct file_operations svm_fops = { .owner = THIS_MODULE, .open = svm_open, .mmap = svm_mmap, - .get_unmapped_area = svm_get_unmapped_area, .unlocked_ioctl = svm_ioctl, };
@@ -1690,6 +1502,7 @@ static int svm_acpi_add_core(struct svm_device *sdev, struct core_device *cdev = NULL; char *name = NULL; enum dev_dma_attr attr; + const union acpi_object *obj;
name = devm_kasprintf(sdev->dev, GFP_KERNEL, "svm_child_dev%d", id); if (name == NULL) @@ -1714,7 +1527,7 @@ static int svm_acpi_add_core(struct svm_device *sdev, return err; }
- attr = acpi_get_dma_attr(children); + attr = device_get_dma_attr(&children->dev); if (attr != DEV_DMA_NOT_SUPPORTED) { err = acpi_dma_configure(&cdev->dev, attr); if (err) { @@ -1723,11 +1536,13 @@ static int svm_acpi_add_core(struct svm_device *sdev, } }
- err = acpi_dev_prop_read_single(children, "hisi,smmu-bypass", - DEV_PROP_U8, &cdev->smmu_bypass); + err = acpi_dev_get_property(children, "hisi,smmu-bypass", + DEV_PROP_U8, &obj); if (err) dev_info(&children->dev, "read smmu bypass failed\n");
+ cdev->smmu_bypass = *(u8 *)obj->integer.value; + cdev->group = iommu_group_get(&cdev->dev); if (IS_ERR_OR_NULL(cdev->group)) { dev_err(&cdev->dev, "smmu is not right configured\n");
From: Lijun Fang fanglijun3@huawei.com
ascend inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4JMM0 CVE: NA -------------------
Set CONFIG_HISI_SVM as m by default
Signed-off-by: Lijun Fang fanglijun3@huawei.com Reviewed-by: Weilong Chen chenweilong@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- arch/arm64/configs/openeuler_defconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/arch/arm64/configs/openeuler_defconfig b/arch/arm64/configs/openeuler_defconfig index fb5b9650b458..83691c2ad677 100644 --- a/arch/arm64/configs/openeuler_defconfig +++ b/arch/arm64/configs/openeuler_defconfig @@ -3300,7 +3300,7 @@ CONFIG_TCG_TIS_ST33ZP24_I2C=y CONFIG_TCG_TIS_ST33ZP24_SPI=y # CONFIG_XILLYBUS is not set CONFIG_PIN_MEMORY_DEV=m -CONFIG_HISI_SVM=y +CONFIG_HISI_SVM=m # end of Character devices
# CONFIG_RANDOM_TRUST_CPU is not set
From: Dave Ertman david.m.ertman@intel.com
mainline inclusion from mainline-v5.11-rc1 commit 7de3697e9cbd4bd3d62bafa249d57990e1b8f294 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4O662 CVE: NA
-------------------------------------------------
Add support for the Auxiliary Bus, auxiliary_device and auxiliary_driver. It enables drivers to create an auxiliary_device and bind an auxiliary_driver to it.
The bus supports probe/remove, shutdown, and suspend/resume callbacks. Each auxiliary_device has a unique string-based id; a driver binds to an auxiliary_device based on this id through the bus.
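As a sketch of the intended calling convention (not part of this diff; the struct and function names are hypothetical, while the two-step init/add API is the one introduced here), a parent driver registers a child device roughly like this:

#include <linux/auxiliary_bus.h>
#include <linux/slab.h>

struct my_parent {
	struct auxiliary_device adev;
	/* shared data/ops for the child driver would live here */
};

static void my_adev_release(struct device *dev)
{
	struct my_parent *p = container_of(dev, struct my_parent, adev.dev);

	kfree(p);	/* all memory cleanup happens in .release */
}

static int my_register_child(struct device *parent)
{
	struct my_parent *p;
	int ret;

	p = kzalloc(sizeof(*p), GFP_KERNEL);
	if (!p)
		return -ENOMEM;

	p->adev.name = "rdma";	/* match_name becomes "<KBUILD_MODNAME>.rdma" */
	p->adev.id = 0;
	p->adev.dev.parent = parent;
	p->adev.dev.release = my_adev_release;

	ret = auxiliary_device_init(&p->adev);
	if (ret) {
		kfree(p);	/* init failed, .release will not be called */
		return ret;
	}

	ret = auxiliary_device_add(&p->adev);
	if (ret) {
		auxiliary_device_uninit(&p->adev);	/* drops the ref, fires .release */
		return ret;
	}

	return 0;
}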
Co-developed-by: Kiran Patil kiran.patil@intel.com Co-developed-by: Ranjani Sridharan ranjani.sridharan@linux.intel.com Co-developed-by: Fred Oh fred.oh@linux.intel.com Co-developed-by: Leon Romanovsky leonro@nvidia.com Signed-off-by: Kiran Patil kiran.patil@intel.com Signed-off-by: Ranjani Sridharan ranjani.sridharan@linux.intel.com Signed-off-by: Fred Oh fred.oh@linux.intel.com Signed-off-by: Leon Romanovsky leonro@nvidia.com Signed-off-by: Dave Ertman david.m.ertman@intel.com Reviewed-by: Pierre-Louis Bossart pierre-louis.bossart@linux.intel.com Reviewed-by: Shiraz Saleem shiraz.saleem@intel.com Reviewed-by: Parav Pandit parav@mellanox.com Reviewed-by: Dan Williams dan.j.williams@intel.com Reviewed-by: Martin Habets mhabets@solarflare.com Link: https://lore.kernel.org/r/20201113161859.1775473-2-david.m.ertman@intel.com Signed-off-by: Dan Williams dan.j.williams@intel.com Link: https://lore.kernel.org/r/160695681289.505290.8978295443574440604.stgit@dwil... Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org Signed-off-by: Yuanzheng Song songyuanzheng@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Reviewed-by: Hanjun Guo guohanjun@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- Documentation/driver-api/auxiliary_bus.rst | 234 ++++++++++++++++++ Documentation/driver-api/index.rst | 1 + drivers/base/Kconfig | 3 + drivers/base/Makefile | 1 + drivers/base/auxiliary.c | 268 +++++++++++++++++++++ include/linux/auxiliary_bus.h | 78 ++++++ include/linux/mod_devicetable.h | 8 + scripts/mod/devicetable-offsets.c | 3 + scripts/mod/file2alias.c | 8 + 9 files changed, 604 insertions(+) create mode 100644 Documentation/driver-api/auxiliary_bus.rst create mode 100644 drivers/base/auxiliary.c create mode 100644 include/linux/auxiliary_bus.h
diff --git a/Documentation/driver-api/auxiliary_bus.rst b/Documentation/driver-api/auxiliary_bus.rst new file mode 100644 index 000000000000..5dd7804631ef --- /dev/null +++ b/Documentation/driver-api/auxiliary_bus.rst @@ -0,0 +1,234 @@ +.. SPDX-License-Identifier: GPL-2.0-only + +============= +Auxiliary Bus +============= + +In some subsystems, the functionality of the core device (PCI/ACPI/other) is +too complex for a single device to be managed by a monolithic driver +(e.g. Sound Open Firmware), multiple devices might implement a common +intersection of functionality (e.g. NICs + RDMA), or a driver may want to +export an interface for another subsystem to drive (e.g. SIOV Physical Function +export Virtual Function management). A split of the functionality into child-devices representing sub-domains of functionality makes it possible to +compartmentalize, layer, and distribute domain-specific concerns via a Linux +device-driver model. + +An example for this kind of requirement is the audio subsystem where a single +IP is handling multiple entities such as HDMI, Soundwire, local devices such as +mics/speakers etc. The split for the core's functionality can be arbitrary or +be defined by the DSP firmware topology and include hooks for test/debug. This +allows for the audio core device to be minimal and focused on hardware-specific +control and communication. + +Each auxiliary_device represents a part of its parent functionality. The +generic behavior can be extended and specialized as needed by encapsulating an +auxiliary_device within other domain-specific structures and the use of .ops +callbacks. Devices on the auxiliary bus do not share any structures and the use +of a communication channel with the parent is domain-specific. + +Note that ops are intended as a way to augment instance behavior within a class +of auxiliary devices, it is not the mechanism for exporting common +infrastructure from the parent. Consider EXPORT_SYMBOL_NS() to convey +infrastructure from the parent module to the auxiliary module(s). + + +When Should the Auxiliary Bus Be Used +===================================== + +The auxiliary bus is to be used when a driver and one or more kernel modules, +who share a common header file with the driver, need a mechanism to connect and +provide access to a shared object allocated by the auxiliary_device's +registering driver. The registering driver for the auxiliary_device(s) and the +kernel module(s) registering auxiliary_drivers can be from the same subsystem, +or from multiple subsystems. + +The emphasis here is on a common generic interface that keeps subsystem +customization out of the bus infrastructure. + +One example is a PCI network device that is RDMA-capable and exports a child +device to be driven by an auxiliary_driver in the RDMA subsystem. The PCI +driver allocates and registers an auxiliary_device for each physical +function on the NIC. The RDMA driver registers an auxiliary_driver that claims +each of these auxiliary_devices. This conveys data/ops published by the parent +PCI device/driver to the RDMA auxiliary_driver. + +Another use case is for the PCI device to be split out into multiple sub +functions. For each sub function an auxiliary_device is created. A PCI sub +function driver binds to such devices that creates its own one or more class +devices.
A PCI sub function auxiliary device is likely to be contained in a +struct with additional attributes such as user defined sub function number and +optional attributes such as resources and a link to the parent device. These +attributes could be used by systemd/udev and hence should be initialized +before a driver binds to an auxiliary_device. + +A key requirement for utilizing the auxiliary bus is that there is no +dependency on a physical bus, device, register accesses or regmap support. +These individual devices split from the core cannot live on the platform bus as +they are not physical devices that are controlled by DT/ACPI. The same +argument applies for not using MFD in this scenario as MFD relies on individual +function devices being physical devices. + +Auxiliary Device +================ + +An auxiliary_device represents a part of its parent device's functionality. It +is given a name that, combined with the registering driver's KBUILD_MODNAME, +creates a match_name that is used for driver binding, and an id that, combined +with the match_name, provides a unique name to register with the bus subsystem. + +Registering an auxiliary_device is a two-step process. First call +auxiliary_device_init(), which checks several aspects of the auxiliary_device +struct and performs a device_initialize(). After this step completes, any +error state must have a call to auxiliary_device_uninit() in its resolution path. +The second step in registering an auxiliary_device is to perform a call to +auxiliary_device_add(), which sets the name of the device and adds the device to +the bus. + +Unregistering an auxiliary_device is also a two-step process to mirror the +register process. First call auxiliary_device_delete(), then call +auxiliary_device_uninit(). + +.. code-block:: c + + struct auxiliary_device { + struct device dev; + const char *name; + u32 id; + }; + +If two auxiliary_devices both with a match_name "mod.foo" are registered onto +the bus, they must have unique id values (e.g. "x" and "y") so that the +registered devices' names are "mod.foo.x" and "mod.foo.y". If match_name + id +are not unique, then the device_add fails and generates an error message. + +The auxiliary_device.dev.type.release or auxiliary_device.dev.release must be +populated with a non-NULL pointer to successfully register the auxiliary_device. + +The auxiliary_device.dev.parent must also be populated. + +Auxiliary Device Memory Model and Lifespan +------------------------------------------ + +The registering driver is the entity that allocates memory for the +auxiliary_device and registers it on the auxiliary bus. It is important to note +that, as opposed to the platform bus, the registering driver is wholly +responsible for the management of the memory used for the driver object. + +A parent object, defined in the shared header file, contains the +auxiliary_device. It also contains a pointer to the shared object(s), which +also is defined in the shared header. Both the parent object and the shared +object(s) are allocated by the registering driver. This layout allows the +auxiliary_driver's registering module to perform a container_of() call to go +from the pointer to the auxiliary_device, that is passed during the call to the +auxiliary_driver's probe function, up to the parent object, and then have +access to the shared object(s). + +The memory for the auxiliary_device is freed only in its release() callback +flow as defined by its registering driver.
+ +The memory for the shared object(s) must have a lifespan equal to, or greater +than, the lifespan of the memory for the auxiliary_device. The auxiliary_driver +should only consider that this shared object is valid as long as the +auxiliary_device is still registered on the auxiliary bus. It is up to the +registering driver to manage (e.g. free or keep available) the memory for the +shared object beyond the life of the auxiliary_device. + +The registering driver must unregister all auxiliary devices before its own +driver.remove() is completed. + +Auxiliary Drivers +================= + +Auxiliary drivers follow the standard driver model convention, where +discovery/enumeration is handled by the core, and drivers +provide probe() and remove() methods. They support power management +and shutdown notifications using the standard conventions. + +.. code-block:: c + + struct auxiliary_driver { + int (*probe)(struct auxiliary_device *, + const struct auxiliary_device_id *id); + int (*remove)(struct auxiliary_device *); + void (*shutdown)(struct auxiliary_device *); + int (*suspend)(struct auxiliary_device *, pm_message_t); + int (*resume)(struct auxiliary_device *); + struct device_driver driver; + const struct auxiliary_device_id *id_table; + }; + +Auxiliary drivers register themselves with the bus by calling +auxiliary_driver_register(). The id_table contains the match_names of auxiliary +devices that a driver can bind with. + +Example Usage +============= + +Auxiliary devices are created and registered by a subsystem-level core device +that needs to break up its functionality into smaller fragments. One way to +extend the scope of an auxiliary_device is to encapsulate it within a domain-specific structure defined by the parent device. This structure contains the +auxiliary_device and any associated shared data/callbacks needed to establish +the connection with the parent. + +An example is: + +.. code-block:: c + + struct foo { + struct auxiliary_device auxdev; + void (*connect)(struct auxiliary_device *auxdev); + void (*disconnect)(struct auxiliary_device *auxdev); + void *data; + }; + +The parent device then registers the auxiliary_device by calling +auxiliary_device_init(), and then auxiliary_device_add(), with the pointer to +the auxdev member of the above structure. The parent provides a name for the +auxiliary_device that, combined with the parent's KBUILD_MODNAME, creates a +match_name that is used for matching and binding with a driver. + +Whenever an auxiliary_driver is registered, based on the match_name, the +auxiliary_driver's probe() is invoked for the matching devices. The +auxiliary_driver can also be encapsulated inside custom drivers that make the +core device's functionality extensible by adding additional domain-specific ops +as follows: + +.. code-block:: c + + struct my_ops { + void (*send)(struct auxiliary_device *auxdev); + void (*receive)(struct auxiliary_device *auxdev); + }; + + + struct my_driver { + struct auxiliary_driver auxiliary_drv; + const struct my_ops ops; + }; + +An example of this type of usage is: + +..
code-block:: c + + const struct auxiliary_device_id my_auxiliary_id_table[] = { + { .name = "foo_mod.foo_dev" }, + { }, + }; + + const struct my_ops my_custom_ops = { + .send = my_tx, + .receive = my_rx, + }; + + const struct my_driver my_drv = { + .auxiliary_drv = { + .name = "myauxiliarydrv", + .id_table = my_auxiliary_id_table, + .probe = my_probe, + .remove = my_remove, + .shutdown = my_shutdown, + }, + .ops = my_custom_ops, + }; diff --git a/Documentation/driver-api/index.rst b/Documentation/driver-api/index.rst index f357f3eb400c..86759a74b7f1 100644 --- a/Documentation/driver-api/index.rst +++ b/Documentation/driver-api/index.rst @@ -72,6 +72,7 @@ available subsections can be seen below. thermal/index fpga/index acpi/index + auxiliary_bus backlight/lp855x-driver.rst connector console diff --git a/drivers/base/Kconfig b/drivers/base/Kconfig index 8d7001712062..040be48ce046 100644 --- a/drivers/base/Kconfig +++ b/drivers/base/Kconfig @@ -1,6 +1,9 @@ # SPDX-License-Identifier: GPL-2.0 menu "Generic Driver Options"
+config AUXILIARY_BUS + bool + config UEVENT_HELPER bool "Support for uevent helper" help diff --git a/drivers/base/Makefile b/drivers/base/Makefile index 41369fc7004f..5e7bf9669a81 100644 --- a/drivers/base/Makefile +++ b/drivers/base/Makefile @@ -7,6 +7,7 @@ obj-y := component.o core.o bus.o dd.o syscore.o \ attribute_container.o transport_class.o \ topology.o container.o property.o cacheinfo.o \ swnode.o +obj-$(CONFIG_AUXILIARY_BUS) += auxiliary.o obj-$(CONFIG_DEVTMPFS) += devtmpfs.o obj-y += power/ obj-$(CONFIG_ISA_BUS_API) += isa.o diff --git a/drivers/base/auxiliary.c b/drivers/base/auxiliary.c new file mode 100644 index 000000000000..ef2af417438b --- /dev/null +++ b/drivers/base/auxiliary.c @@ -0,0 +1,268 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2019-2020 Intel Corporation + * + * Please see Documentation/driver-api/auxiliary_bus.rst for more information. + */ + +#define pr_fmt(fmt) "%s:%s: " fmt, KBUILD_MODNAME, __func__ + +#include <linux/device.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/pm_domain.h> +#include <linux/pm_runtime.h> +#include <linux/string.h> +#include <linux/auxiliary_bus.h> + +static const struct auxiliary_device_id *auxiliary_match_id(const struct auxiliary_device_id *id, + const struct auxiliary_device *auxdev) +{ + for (; id->name[0]; id++) { + const char *p = strrchr(dev_name(&auxdev->dev), '.'); + int match_size; + + if (!p) + continue; + match_size = p - dev_name(&auxdev->dev); + + /* use dev_name(&auxdev->dev) prefix before last '.' char to match to */ + if (strlen(id->name) == match_size && + !strncmp(dev_name(&auxdev->dev), id->name, match_size)) + return id; + } + return NULL; +} + +static int auxiliary_match(struct device *dev, struct device_driver *drv) +{ + struct auxiliary_device *auxdev = to_auxiliary_dev(dev); + struct auxiliary_driver *auxdrv = to_auxiliary_drv(drv); + + return !!auxiliary_match_id(auxdrv->id_table, auxdev); +} + +static int auxiliary_uevent(struct device *dev, struct kobj_uevent_env *env) +{ + const char *name, *p; + + name = dev_name(dev); + p = strrchr(name, '.'); + + return add_uevent_var(env, "MODALIAS=%s%.*s", AUXILIARY_MODULE_PREFIX, (int)(p - name), + name); +} + +static const struct dev_pm_ops auxiliary_dev_pm_ops = { + SET_RUNTIME_PM_OPS(pm_generic_runtime_suspend, pm_generic_runtime_resume, NULL) + SET_SYSTEM_SLEEP_PM_OPS(pm_generic_suspend, pm_generic_resume) +}; + +static int auxiliary_bus_probe(struct device *dev) +{ + struct auxiliary_driver *auxdrv = to_auxiliary_drv(dev->driver); + struct auxiliary_device *auxdev = to_auxiliary_dev(dev); + int ret; + + ret = dev_pm_domain_attach(dev, true); + if (ret) { + dev_warn(dev, "Failed to attach to PM Domain : %d\n", ret); + return ret; + } + + ret = auxdrv->probe(auxdev, auxiliary_match_id(auxdrv->id_table, auxdev)); + if (ret) + dev_pm_domain_detach(dev, true); + + return ret; +} + +static int auxiliary_bus_remove(struct device *dev) +{ + struct auxiliary_driver *auxdrv = to_auxiliary_drv(dev->driver); + struct auxiliary_device *auxdev = to_auxiliary_dev(dev); + int ret = 0; + + if (auxdrv->remove) + ret = auxdrv->remove(auxdev); + dev_pm_domain_detach(dev, true); + + return ret; +} + +static void auxiliary_bus_shutdown(struct device *dev) +{ + struct auxiliary_driver *auxdrv = to_auxiliary_drv(dev->driver); + struct auxiliary_device *auxdev = to_auxiliary_dev(dev); + + if (auxdrv->shutdown) + auxdrv->shutdown(auxdev); +} + +static struct bus_type auxiliary_bus_type = { + .name = "auxiliary", + .probe = 
auxiliary_bus_probe, + .remove = auxiliary_bus_remove, + .shutdown = auxiliary_bus_shutdown, + .match = auxiliary_match, + .uevent = auxiliary_uevent, + .pm = &auxiliary_dev_pm_ops, +}; + +/** + * auxiliary_device_init - check auxiliary_device and initialize + * @auxdev: auxiliary device struct + * + * This is the first step in the two-step process to register an auxiliary_device. + * + * When this function returns an error code, then the device_initialize will *not* have + * been performed, and the caller will be responsible to free any memory allocated for the + * auxiliary_device in the error path directly. + * + * It returns 0 on success. On success, the device_initialize has been performed. After this + * point any error unwinding will need to include a call to auxiliary_device_uninit(). + * In this post-initialize error scenario, a call to the device's .release callback will be + * triggered, and all memory clean-up is expected to be handled there. + */ +int auxiliary_device_init(struct auxiliary_device *auxdev) +{ + struct device *dev = &auxdev->dev; + + if (!dev->parent) { + pr_err("auxiliary_device has a NULL dev->parent\n"); + return -EINVAL; + } + + if (!auxdev->name) { + pr_err("auxiliary_device has a NULL name\n"); + return -EINVAL; + } + + dev->bus = &auxiliary_bus_type; + device_initialize(&auxdev->dev); + return 0; +} +EXPORT_SYMBOL_GPL(auxiliary_device_init); + +/** + * __auxiliary_device_add - add an auxiliary bus device + * @auxdev: auxiliary bus device to add to the bus + * @modname: name of the parent device's driver module + * + * This is the second step in the two-step process to register an auxiliary_device. + * + * This function must be called after a successful call to auxiliary_device_init(), which + * will perform the device_initialize. This means that if this returns an error code, then a + * call to auxiliary_device_uninit() must be performed so that the .release callback will + * be triggered to free the memory associated with the auxiliary_device. + * + * The expectation is that users will call the "auxiliary_device_add" macro so that the caller's + * KBUILD_MODNAME is automatically inserted for the modname parameter. Only if a user requires + * a custom name would this version be called directly. + */ +int __auxiliary_device_add(struct auxiliary_device *auxdev, const char *modname) +{ + struct device *dev = &auxdev->dev; + int ret; + + if (!modname) { + pr_err("auxiliary device modname is NULL\n"); + return -EINVAL; + } + + ret = dev_set_name(dev, "%s.%s.%d", modname, auxdev->name, auxdev->id); + if (ret) { + pr_err("auxiliary device dev_set_name failed: %d\n", ret); + return ret; + } + + ret = device_add(dev); + if (ret) + dev_err(dev, "adding auxiliary device failed!: %d\n", ret); + + return ret; +} +EXPORT_SYMBOL_GPL(__auxiliary_device_add); + +/** + * auxiliary_find_device - auxiliary device iterator for locating a particular device. + * @start: Device to begin with + * @data: Data to pass to match function + * @match: Callback function to check device + * + * This function returns a reference to a device that is 'found' + * for later use, as determined by the @match callback. + * + * The callback should return 0 if the device doesn't match and non-zero + * if it does. If the callback returns non-zero, this function will + * return to the caller and not iterate over any more devices. 
+ */ +struct auxiliary_device * +auxiliary_find_device(struct device *start, const void *data, + int (*match)(struct device *dev, const void *data)) +{ + struct device *dev; + + dev = bus_find_device(&auxiliary_bus_type, start, data, match); + if (!dev) + return NULL; + + return to_auxiliary_dev(dev); +} +EXPORT_SYMBOL_GPL(auxiliary_find_device); + +/** + * __auxiliary_driver_register - register a driver for auxiliary bus devices + * @auxdrv: auxiliary_driver structure + * @owner: owning module/driver + * @modname: KBUILD_MODNAME for parent driver + */ +int __auxiliary_driver_register(struct auxiliary_driver *auxdrv, struct module *owner, + const char *modname) +{ + if (WARN_ON(!auxdrv->probe) || WARN_ON(!auxdrv->id_table)) + return -EINVAL; + + if (auxdrv->name) + auxdrv->driver.name = kasprintf(GFP_KERNEL, "%s.%s", modname, auxdrv->name); + else + auxdrv->driver.name = kasprintf(GFP_KERNEL, "%s", modname); + if (!auxdrv->driver.name) + return -ENOMEM; + + auxdrv->driver.owner = owner; + auxdrv->driver.bus = &auxiliary_bus_type; + auxdrv->driver.mod_name = modname; + + return driver_register(&auxdrv->driver); +} +EXPORT_SYMBOL_GPL(__auxiliary_driver_register); + +/** + * auxiliary_driver_unregister - unregister a driver + * @auxdrv: auxiliary_driver structure + */ +void auxiliary_driver_unregister(struct auxiliary_driver *auxdrv) +{ + driver_unregister(&auxdrv->driver); + kfree(auxdrv->driver.name); +} +EXPORT_SYMBOL_GPL(auxiliary_driver_unregister); + +static int __init auxiliary_bus_init(void) +{ + return bus_register(&auxiliary_bus_type); +} + +static void __exit auxiliary_bus_exit(void) +{ + bus_unregister(&auxiliary_bus_type); +} + +module_init(auxiliary_bus_init); +module_exit(auxiliary_bus_exit); + +MODULE_LICENSE("GPL v2"); +MODULE_DESCRIPTION("Auxiliary Bus"); +MODULE_AUTHOR("David Ertman david.m.ertman@intel.com"); +MODULE_AUTHOR("Kiran Patil kiran.patil@intel.com"); diff --git a/include/linux/auxiliary_bus.h b/include/linux/auxiliary_bus.h new file mode 100644 index 000000000000..282fbf7bf9af --- /dev/null +++ b/include/linux/auxiliary_bus.h @@ -0,0 +1,78 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (c) 2019-2020 Intel Corporation + * + * Please see Documentation/driver-api/auxiliary_bus.rst for more information. 
+ */ + +#ifndef _AUXILIARY_BUS_H_ +#define _AUXILIARY_BUS_H_ + +#include <linux/device.h> +#include <linux/mod_devicetable.h> +#include <linux/slab.h> + +struct auxiliary_device { + struct device dev; + const char *name; + u32 id; +}; + +struct auxiliary_driver { + int (*probe)(struct auxiliary_device *auxdev, const struct auxiliary_device_id *id); + int (*remove)(struct auxiliary_device *auxdev); + void (*shutdown)(struct auxiliary_device *auxdev); + int (*suspend)(struct auxiliary_device *auxdev, pm_message_t state); + int (*resume)(struct auxiliary_device *auxdev); + const char *name; + struct device_driver driver; + const struct auxiliary_device_id *id_table; +}; + +static inline struct auxiliary_device *to_auxiliary_dev(struct device *dev) +{ + return container_of(dev, struct auxiliary_device, dev); +} + +static inline struct auxiliary_driver *to_auxiliary_drv(struct device_driver *drv) +{ + return container_of(drv, struct auxiliary_driver, driver); +} + +int auxiliary_device_init(struct auxiliary_device *auxdev); +int __auxiliary_device_add(struct auxiliary_device *auxdev, const char *modname); +#define auxiliary_device_add(auxdev) __auxiliary_device_add(auxdev, KBUILD_MODNAME) + +static inline void auxiliary_device_uninit(struct auxiliary_device *auxdev) +{ + put_device(&auxdev->dev); +} + +static inline void auxiliary_device_delete(struct auxiliary_device *auxdev) +{ + device_del(&auxdev->dev); +} + +int __auxiliary_driver_register(struct auxiliary_driver *auxdrv, struct module *owner, + const char *modname); +#define auxiliary_driver_register(auxdrv) \ + __auxiliary_driver_register(auxdrv, THIS_MODULE, KBUILD_MODNAME) + +void auxiliary_driver_unregister(struct auxiliary_driver *auxdrv); + +/** + * module_auxiliary_driver() - Helper macro for registering an auxiliary driver + * @__auxiliary_driver: auxiliary driver struct + * + * Helper macro for auxiliary drivers which do not do anything special in + * module init/exit. This eliminates a lot of boilerplate. Each module may only + * use this macro once, and calling it replaces module_init() and module_exit() + */ +#define module_auxiliary_driver(__auxiliary_driver) \ + module_driver(__auxiliary_driver, auxiliary_driver_register, auxiliary_driver_unregister) + +struct auxiliary_device * +auxiliary_find_device(struct device *start, const void *data, + int (*match)(struct device *dev, const void *data)); + +#endif /* _AUXILIARY_BUS_H_ */ diff --git a/include/linux/mod_devicetable.h b/include/linux/mod_devicetable.h index 5b08a473cdba..c425290b21e2 100644 --- a/include/linux/mod_devicetable.h +++ b/include/linux/mod_devicetable.h @@ -838,4 +838,12 @@ struct mhi_device_id { kernel_ulong_t driver_data; };
+#define AUXILIARY_NAME_SIZE 32 +#define AUXILIARY_MODULE_PREFIX "auxiliary:" + +struct auxiliary_device_id { + char name[AUXILIARY_NAME_SIZE]; + kernel_ulong_t driver_data; +}; + #endif /* LINUX_MOD_DEVICETABLE_H */ diff --git a/scripts/mod/devicetable-offsets.c b/scripts/mod/devicetable-offsets.c index 27007c18e754..e377f52dbfa3 100644 --- a/scripts/mod/devicetable-offsets.c +++ b/scripts/mod/devicetable-offsets.c @@ -243,5 +243,8 @@ int main(void) DEVID(mhi_device_id); DEVID_FIELD(mhi_device_id, chan);
+ DEVID(auxiliary_device_id); + DEVID_FIELD(auxiliary_device_id, name); + return 0; } diff --git a/scripts/mod/file2alias.c b/scripts/mod/file2alias.c index 2417dd1dee33..fb4827027536 100644 --- a/scripts/mod/file2alias.c +++ b/scripts/mod/file2alias.c @@ -1364,6 +1364,13 @@ static int do_mhi_entry(const char *filename, void *symval, char *alias) { DEF_FIELD_ADDR(symval, mhi_device_id, chan); sprintf(alias, MHI_DEVICE_MODALIAS_FMT, *chan); + return 1; +} + +static int do_auxiliary_entry(const char *filename, void *symval, char *alias) +{ + DEF_FIELD_ADDR(symval, auxiliary_device_id, name); + sprintf(alias, AUXILIARY_MODULE_PREFIX "%s", *name);
return 1; } @@ -1442,6 +1449,7 @@ static const struct devtable devtable[] = { {"tee", SIZE_tee_client_device_id, do_tee_entry}, {"wmi", SIZE_wmi_device_id, do_wmi_entry}, {"mhi", SIZE_mhi_device_id, do_mhi_entry}, + {"auxiliary", SIZE_auxiliary_device_id, do_auxiliary_entry}, };
/* Create MODULE_ALIAS() statements.
From: Greg Kroah-Hartman gregkh@linuxfoundation.org
mainline inclusion from mainline-v5.11-rc1 commit 7bbb79ff5f7499e0c5d65987458410e8099207d8 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4O662 CVE: NA
-------------------------------------------------
No need to include slab.h in include/linux/auxiliary_bus.h, as it is not needed there. Move it to drivers/base/auxiliary.c instead.
Cc: Dan Williams dan.j.williams@intel.com Cc: Dave Ertman david.m.ertman@intel.com Cc: Fred Oh fred.oh@linux.intel.com Cc: Kiran Patil kiran.patil@intel.com Cc: Leon Romanovsky leonro@nvidia.com Cc: Martin Habets mhabets@solarflare.com Cc: Parav Pandit parav@mellanox.com Cc: Pierre-Louis Bossart pierre-louis.bossart@linux.intel.com Cc: Ranjani Sridharan ranjani.sridharan@linux.intel.com Cc: Shiraz Saleem shiraz.saleem@intel.com Link: https://lore.kernel.org/r/X8og8xi3WkoYXet9@kroah.com Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org Signed-off-by: Yuanzheng Song songyuanzheng@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Reviewed-by: Hanjun Guo guohanjun@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- drivers/base/auxiliary.c | 1 + include/linux/auxiliary_bus.h | 1 - 2 files changed, 1 insertion(+), 1 deletion(-)
diff --git a/drivers/base/auxiliary.c b/drivers/base/auxiliary.c index ef2af417438b..eca36d6284d0 100644 --- a/drivers/base/auxiliary.c +++ b/drivers/base/auxiliary.c @@ -9,6 +9,7 @@
#include <linux/device.h> #include <linux/init.h> +#include <linux/slab.h> #include <linux/module.h> #include <linux/pm_domain.h> #include <linux/pm_runtime.h> diff --git a/include/linux/auxiliary_bus.h b/include/linux/auxiliary_bus.h index 282fbf7bf9af..3580743d0e8d 100644 --- a/include/linux/auxiliary_bus.h +++ b/include/linux/auxiliary_bus.h @@ -10,7 +10,6 @@
#include <linux/device.h> #include <linux/mod_devicetable.h> -#include <linux/slab.h>
struct auxiliary_device { struct device dev;
From: Greg Kroah-Hartman gregkh@linuxfoundation.org
mainline inclusion from mainline-v5.11-rc1 commit 8142a46c50d2dd8160c42284e1044eed3bec0d18 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4O662 CVE: NA
-------------------------------------------------
There's an effort to move the remove() callback in the driver core to not return an int, as nothing can be done if this function fails. To make that effort easier, make the aux bus remove function void to start with so that no users have to be changed sometime in the future.
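After this change, a driver's remove() hook simply performs teardown and returns nothing. A minimal sketch with hypothetical names (my_state and my_state_teardown are illustrative, not from this patch):

/* Previously: int (*remove)(struct auxiliary_device *); the int was unusable. */
static void my_aux_remove(struct auxiliary_device *auxdev)
{
	struct my_state *state = dev_get_drvdata(&auxdev->dev);

	my_state_teardown(state);	/* a failure here could not be acted on anyway */
}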
Cc: Dan Williams dan.j.williams@intel.com Cc: Dave Ertman david.m.ertman@intel.com Cc: Fred Oh fred.oh@linux.intel.com Cc: Kiran Patil kiran.patil@intel.com Cc: Leon Romanovsky leonro@nvidia.com Cc: Martin Habets mhabets@solarflare.com Cc: Parav Pandit parav@mellanox.com Cc: Pierre-Louis Bossart pierre-louis.bossart@linux.intel.com Cc: Ranjani Sridharan ranjani.sridharan@linux.intel.com Cc: Shiraz Saleem shiraz.saleem@intel.com Link: https://lore.kernel.org/r/X8ohB1ks1NK7kPop@kroah.com Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org Signed-off-by: Yuanzheng Song songyuanzheng@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Reviewed-by: Hanjun Guo guohanjun@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- Documentation/driver-api/auxiliary_bus.rst | 2 +- drivers/base/auxiliary.c | 5 ++--- include/linux/auxiliary_bus.h | 2 +- 3 files changed, 4 insertions(+), 5 deletions(-)
diff --git a/Documentation/driver-api/auxiliary_bus.rst b/Documentation/driver-api/auxiliary_bus.rst index 5dd7804631ef..2312506b0674 100644 --- a/Documentation/driver-api/auxiliary_bus.rst +++ b/Documentation/driver-api/auxiliary_bus.rst @@ -150,7 +150,7 @@ and shutdown notifications using the standard conventions. struct auxiliary_driver { int (*probe)(struct auxiliary_device *, const struct auxiliary_device_id *id); - int (*remove)(struct auxiliary_device *); + void (*remove)(struct auxiliary_device *); void (*shutdown)(struct auxiliary_device *); int (*suspend)(struct auxiliary_device *, pm_message_t); int (*resume)(struct auxiliary_device *); diff --git a/drivers/base/auxiliary.c b/drivers/base/auxiliary.c index eca36d6284d0..c44e85802b43 100644 --- a/drivers/base/auxiliary.c +++ b/drivers/base/auxiliary.c @@ -82,13 +82,12 @@ static int auxiliary_bus_remove(struct device *dev) { struct auxiliary_driver *auxdrv = to_auxiliary_drv(dev->driver); struct auxiliary_device *auxdev = to_auxiliary_dev(dev); - int ret = 0;
if (auxdrv->remove) - ret = auxdrv->remove(auxdev); + auxdrv->remove(auxdev); dev_pm_domain_detach(dev, true);
- return ret; + return 0; }
static void auxiliary_bus_shutdown(struct device *dev) diff --git a/include/linux/auxiliary_bus.h b/include/linux/auxiliary_bus.h index 3580743d0e8d..d67b17606210 100644 --- a/include/linux/auxiliary_bus.h +++ b/include/linux/auxiliary_bus.h @@ -19,7 +19,7 @@ struct auxiliary_device {
struct auxiliary_driver { int (*probe)(struct auxiliary_device *auxdev, const struct auxiliary_device_id *id); - int (*remove)(struct auxiliary_device *auxdev); + void (*remove)(struct auxiliary_device *auxdev); void (*shutdown)(struct auxiliary_device *auxdev); int (*suspend)(struct auxiliary_device *auxdev, pm_message_t state); int (*resume)(struct auxiliary_device *auxdev);
From: Greg Kroah-Hartman gregkh@linuxfoundation.org
mainline inclusion from mainline-v5.11-rc1 commit 0d2bf11a6b3e275a526b8d42d8d4a3a6067cf953 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4O662 CVE: NA
-------------------------------------------------
For some reason, the original aux bus patch had some really long lines in a few places, probably due to it being a very long-lived patch in development by many different people. Fix that up so that the two files have consistent line lengths and function formatting styles.
Cc: Dan Williams dan.j.williams@intel.com Cc: Dave Ertman david.m.ertman@intel.com Cc: Fred Oh fred.oh@linux.intel.com Cc: Kiran Patil kiran.patil@intel.com Cc: Leon Romanovsky leonro@nvidia.com Cc: Martin Habets mhabets@solarflare.com Cc: Parav Pandit parav@mellanox.com Cc: Pierre-Louis Bossart pierre-louis.bossart@linux.intel.com Cc: Ranjani Sridharan ranjani.sridharan@linux.intel.com Cc: Shiraz Saleem shiraz.saleem@intel.com Link: https://lore.kernel.org/r/X8oiSFTpYHw1xE/o@kroah.com Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org Signed-off-by: Yuanzheng Song songyuanzheng@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Reviewed-by: Hanjun Guo guohanjun@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- drivers/base/auxiliary.c | 58 +++++++++++++++++++---------------- include/linux/auxiliary_bus.h | 6 ++-- 2 files changed, 35 insertions(+), 29 deletions(-)
diff --git a/drivers/base/auxiliary.c b/drivers/base/auxiliary.c index c44e85802b43..f303daadf843 100644 --- a/drivers/base/auxiliary.c +++ b/drivers/base/auxiliary.c @@ -50,8 +50,8 @@ static int auxiliary_uevent(struct device *dev, struct kobj_uevent_env *env) name = dev_name(dev); p = strrchr(name, '.');
- return add_uevent_var(env, "MODALIAS=%s%.*s", AUXILIARY_MODULE_PREFIX, (int)(p - name), - name); + return add_uevent_var(env, "MODALIAS=%s%.*s", AUXILIARY_MODULE_PREFIX, + (int)(p - name), name); }
static const struct dev_pm_ops auxiliary_dev_pm_ops = { @@ -113,16 +113,18 @@ static struct bus_type auxiliary_bus_type = { * auxiliary_device_init - check auxiliary_device and initialize * @auxdev: auxiliary device struct * - * This is the first step in the two-step process to register an auxiliary_device. + * This is the first step in the two-step process to register an + * auxiliary_device. * - * When this function returns an error code, then the device_initialize will *not* have - * been performed, and the caller will be responsible to free any memory allocated for the - * auxiliary_device in the error path directly. + * When this function returns an error code, then the device_initialize will + * *not* have been performed, and the caller will be responsible to free any + * memory allocated for the auxiliary_device in the error path directly. * - * It returns 0 on success. On success, the device_initialize has been performed. After this - * point any error unwinding will need to include a call to auxiliary_device_uninit(). - * In this post-initialize error scenario, a call to the device's .release callback will be - * triggered, and all memory clean-up is expected to be handled there. + * It returns 0 on success. On success, the device_initialize has been + * performed. After this point any error unwinding will need to include a call + * to auxiliary_device_uninit(). In this post-initialize error scenario, a call + * to the device's .release callback will be triggered, and all memory clean-up + * is expected to be handled there. */ int auxiliary_device_init(struct auxiliary_device *auxdev) { @@ -149,16 +151,19 @@ EXPORT_SYMBOL_GPL(auxiliary_device_init); * @auxdev: auxiliary bus device to add to the bus * @modname: name of the parent device's driver module * - * This is the second step in the two-step process to register an auxiliary_device. + * This is the second step in the two-step process to register an + * auxiliary_device. * - * This function must be called after a successful call to auxiliary_device_init(), which - * will perform the device_initialize. This means that if this returns an error code, then a - * call to auxiliary_device_uninit() must be performed so that the .release callback will - * be triggered to free the memory associated with the auxiliary_device. + * This function must be called after a successful call to + * auxiliary_device_init(), which will perform the device_initialize. This + * means that if this returns an error code, then a call to + * auxiliary_device_uninit() must be performed so that the .release callback + * will be triggered to free the memory associated with the auxiliary_device. * - * The expectation is that users will call the "auxiliary_device_add" macro so that the caller's - * KBUILD_MODNAME is automatically inserted for the modname parameter. Only if a user requires - * a custom name would this version be called directly. + * The expectation is that users will call the "auxiliary_device_add" macro so + * that the caller's KBUILD_MODNAME is automatically inserted for the modname + * parameter. Only if a user requires a custom name would this version be + * called directly. */ int __auxiliary_device_add(struct auxiliary_device *auxdev, const char *modname) { @@ -166,13 +171,13 @@ int __auxiliary_device_add(struct auxiliary_device *auxdev, const char *modname) int ret;
if (!modname) { - pr_err("auxiliary device modname is NULL\n"); + dev_err(dev, "auxiliary device modname is NULL\n"); return -EINVAL; }
ret = dev_set_name(dev, "%s.%s.%d", modname, auxdev->name, auxdev->id); if (ret) { - pr_err("auxiliary device dev_set_name failed: %d\n", ret); + dev_err(dev, "auxiliary device dev_set_name failed: %d\n", ret); return ret; }
@@ -197,9 +202,9 @@ EXPORT_SYMBOL_GPL(__auxiliary_device_add); * if it does. If the callback returns non-zero, this function will * return to the caller and not iterate over any more devices. */ -struct auxiliary_device * -auxiliary_find_device(struct device *start, const void *data, - int (*match)(struct device *dev, const void *data)) +struct auxiliary_device *auxiliary_find_device(struct device *start, + const void *data, + int (*match)(struct device *dev, const void *data)) { struct device *dev;
@@ -217,14 +222,15 @@ EXPORT_SYMBOL_GPL(auxiliary_find_device); * @owner: owning module/driver * @modname: KBUILD_MODNAME for parent driver */ -int __auxiliary_driver_register(struct auxiliary_driver *auxdrv, struct module *owner, - const char *modname) +int __auxiliary_driver_register(struct auxiliary_driver *auxdrv, + struct module *owner, const char *modname) { if (WARN_ON(!auxdrv->probe) || WARN_ON(!auxdrv->id_table)) return -EINVAL;
if (auxdrv->name) - auxdrv->driver.name = kasprintf(GFP_KERNEL, "%s.%s", modname, auxdrv->name); + auxdrv->driver.name = kasprintf(GFP_KERNEL, "%s.%s", modname, + auxdrv->name); else auxdrv->driver.name = kasprintf(GFP_KERNEL, "%s", modname); if (!auxdrv->driver.name) diff --git a/include/linux/auxiliary_bus.h b/include/linux/auxiliary_bus.h index d67b17606210..fc51d45f106b 100644 --- a/include/linux/auxiliary_bus.h +++ b/include/linux/auxiliary_bus.h @@ -70,8 +70,8 @@ void auxiliary_driver_unregister(struct auxiliary_driver *auxdrv); #define module_auxiliary_driver(__auxiliary_driver) \ module_driver(__auxiliary_driver, auxiliary_driver_register, auxiliary_driver_unregister)
-struct auxiliary_device * -auxiliary_find_device(struct device *start, const void *data, - int (*match)(struct device *dev, const void *data)); +struct auxiliary_device *auxiliary_find_device(struct device *start, + const void *data, + int (*match)(struct device *dev, const void *data));
#endif /* _AUXILIARY_BUS_H_ */
From: Dave Jiang dave.jiang@intel.com
mainline inclusion from mainline-v5.11-rc1 commit 784b2c48ac12dcee27db001fb1a3c58c39380cb6 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4O662 CVE: NA
-------------------------------------------------
If the probe of the auxdrv failed, device->driver is set to NULL. During kernel shutdown, the bus shutdown path will then call auxdrv->shutdown and cause an invalid pointer dereference. Add a check to make sure device->driver is not NULL before proceeding.
Fixes: 7de3697e9cbd ("Add auxiliary bus support") Cc: Dave Ertman david.m.ertman@intel.com Signed-off-by: Dave Jiang dave.jiang@intel.com Reviewed-by: Dan Williams dan.j.williams@intel.com Link: https://lore.kernel.org/r/160710040926.1889434.8840329810698403478.stgit@dji... Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org Signed-off-by: Yuanzheng Song songyuanzheng@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Reviewed-by: Hanjun Guo guohanjun@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- drivers/base/auxiliary.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-)
diff --git a/drivers/base/auxiliary.c b/drivers/base/auxiliary.c index f303daadf843..8336535f1e11 100644 --- a/drivers/base/auxiliary.c +++ b/drivers/base/auxiliary.c @@ -92,10 +92,15 @@ static int auxiliary_bus_remove(struct device *dev)
static void auxiliary_bus_shutdown(struct device *dev) { - struct auxiliary_driver *auxdrv = to_auxiliary_drv(dev->driver); - struct auxiliary_device *auxdev = to_auxiliary_dev(dev); + struct auxiliary_driver *auxdrv = NULL; + struct auxiliary_device *auxdev; + + if (dev->driver) { + auxdrv = to_auxiliary_drv(dev->driver); + auxdev = to_auxiliary_dev(dev); + }
- if (auxdrv->shutdown) + if (auxdrv && auxdrv->shutdown) auxdrv->shutdown(auxdev); }
From: Dave Jiang dave.jiang@intel.com
mainline inclusion from mainline-v5.12-rc1 commit 471b12c43f376d5203dbff0e91316eea11f6f4df category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4O662 CVE: NA
-------------------------------------------------
When the auxiliary device code is built into the kernel, it can be executed before the auxiliary bus is registered. This leaves bus->p unallocated and triggers a NULL pointer dereference when the auxiliary bus device gets added with bus_add_device(). Call auxiliary_bus_init() from driver_init() so the bus is initialized before devices are added.
Below is the kernel splat for the bug: [ 1.948215] BUG: kernel NULL pointer dereference, address: 0000000000000060 [ 1.950670] #PF: supervisor read access in kernel mode [ 1.950670] #PF: error_code(0x0000) - not-present page [ 1.950670] PGD 0 [ 1.950670] Oops: 0000 1 SMP NOPTI [ 1.950670] CPU: 0 PID: 1 Comm: swapper/0 Not tainted 5.10.0-intel-nextsvmtest+ #2205 [ 1.950670] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.14.0-0-g155821a1990b-prebuilt.qemu.org 04/01/2014 [ 1.950670] RIP: 0010:bus_add_device+0x64/0x140 [ 1.950670] Code: 00 49 8b 75 20 48 89 df e8 59 a1 ff ff 41 89 c4 85 c0 75 7b 48 8b 53 50 48 85 d2 75 03 48 8b 13 49 8b 85 a0 00 00 00 48 89 de <48> 8 78 60 48 83 c7 18 e8 ef d9 a9 ff 41 89 c4 85 c0 75 45 48 8b [ 1.950670] RSP: 0000:ff46032ac001baf8 EFLAGS: 00010246 [ 1.950670] RAX: 0000000000000000 RBX: ff4597f7414aa680 RCX: 0000000000000000 [ 1.950670] RDX: ff4597f74142bbc0 RSI: ff4597f7414aa680 RDI: ff4597f7414aa680 [ 1.950670] RBP: ff46032ac001bb10 R08: 0000000000000044 R09: 0000000000000228 [ 1.950670] R10: ff4597f741141b30 R11: ff4597f740182a90 R12: 0000000000000000 [ 1.950670] R13: ffffffffa5e936c0 R14: 0000000000000000 R15: 0000000000000000 [ 1.950670] FS: 0000000000000000(0000) GS:ff4597f7bba00000(0000) knlGS:0000000000000000 [ 1.950670] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 1.950670] CR2: 0000000000000060 CR3: 000000002140c001 CR4: 0000000000f71ef0 [ 1.950670] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 1.950670] DR3: 0000000000000000 DR6: 00000000fffe07f0 DR7: 0000000000000400 [ 1.950670] PKRU: 55555554 [ 1.950670] Call Trace: [ 1.950670] device_add+0x3ee/0x850 [ 1.950670] __auxiliary_device_add+0x47/0x60 [ 1.950670] idxd_pci_probe+0xf77/0x1180 [ 1.950670] local_pci_probe+0x4a/0x90 [ 1.950670] pci_device_probe+0xff/0x1b0 [ 1.950670] really_probe+0x1cf/0x440 [ 1.950670] ? rdinit_setup+0x31/0x31 [ 1.950670] driver_probe_device+0xe8/0x150 [ 1.950670] device_driver_attach+0x58/0x60 [ 1.950670] __driver_attach+0x8f/0x150 [ 1.950670] ? device_driver_attach+0x60/0x60 [ 1.950670] ? device_driver_attach+0x60/0x60 [ 1.950670] bus_for_each_dev+0x79/0xc0 [ 1.950670] ? kmem_cache_alloc_trace+0x323/0x430 [ 1.950670] driver_attach+0x1e/0x20 [ 1.950670] bus_add_driver+0x154/0x1f0 [ 1.950670] driver_register+0x70/0xc0 [ 1.950670] __pci_register_driver+0x54/0x60 [ 1.950670] idxd_init_module+0xe2/0xfc [ 1.950670] ? idma64_platform_driver_init+0x19/0x19 [ 1.950670] do_one_initcall+0x4a/0x1e0 [ 1.950670] kernel_init_freeable+0x1fc/0x25c [ 1.950670] ? rest_init+0xba/0xba [ 1.950670] kernel_init+0xe/0x116 [ 1.950670] ret_from_fork+0x1f/0x30 [ 1.950670] Modules linked in: [ 1.950670] CR2: 0000000000000060 [ 1.950670] --[ end trace cd7d1b226d3ca901 ]--
Fixes: 7de3697e9cbd ("Add auxiliary bus support") Reported-by: Jacob Pan jacob.jun.pan@intel.com Reviewed-by: Dan Williams dan.j.williams@intel.com Acked-by: Dave Ertman david.m.ertman@intel.com Signed-off-by: Dave Jiang dave.jiang@intel.com Link: https://lore.kernel.org/r/20210210201611.1611074-1-dave.jiang@intel.com Cc: stable stable@vger.kernel.org Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org Signed-off-by: Yuanzheng Song songyuanzheng@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Reviewed-by: Hanjun Guo guohanjun@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- drivers/base/auxiliary.c | 13 +++---------- drivers/base/base.h | 5 +++++ drivers/base/init.c | 1 + 3 files changed, 9 insertions(+), 10 deletions(-)
diff --git a/drivers/base/auxiliary.c b/drivers/base/auxiliary.c index 8336535f1e11..d8b314e7d0fd 100644 --- a/drivers/base/auxiliary.c +++ b/drivers/base/auxiliary.c @@ -15,6 +15,7 @@ #include <linux/pm_runtime.h> #include <linux/string.h> #include <linux/auxiliary_bus.h> +#include "base.h"
static const struct auxiliary_device_id *auxiliary_match_id(const struct auxiliary_device_id *id, const struct auxiliary_device *auxdev) @@ -260,19 +261,11 @@ void auxiliary_driver_unregister(struct auxiliary_driver *auxdrv) } EXPORT_SYMBOL_GPL(auxiliary_driver_unregister);
-static int __init auxiliary_bus_init(void) +void __init auxiliary_bus_init(void) { - return bus_register(&auxiliary_bus_type); + WARN_ON(bus_register(&auxiliary_bus_type)); }
-static void __exit auxiliary_bus_exit(void) -{ - bus_unregister(&auxiliary_bus_type); -} - -module_init(auxiliary_bus_init); -module_exit(auxiliary_bus_exit); - MODULE_LICENSE("GPL v2"); MODULE_DESCRIPTION("Auxiliary Bus"); MODULE_AUTHOR("David Ertman david.m.ertman@intel.com"); diff --git a/drivers/base/base.h b/drivers/base/base.h index 91cfb8405abd..7d97447460fa 100644 --- a/drivers/base/base.h +++ b/drivers/base/base.h @@ -119,6 +119,11 @@ static inline int hypervisor_init(void) { return 0; } extern int platform_bus_init(void); extern void cpu_dev_init(void); extern void container_dev_init(void); +#ifdef CONFIG_AUXILIARY_BUS +extern void auxiliary_bus_init(void); +#else +static inline void auxiliary_bus_init(void) { } +#endif
struct kobject *virtual_device_parent(struct device *dev);
diff --git a/drivers/base/init.c b/drivers/base/init.c index 908e6520e804..a9f57c22fb9e 100644 --- a/drivers/base/init.c +++ b/drivers/base/init.c @@ -32,6 +32,7 @@ void __init driver_init(void) */ of_core_init(); platform_bus_init(); + auxiliary_bus_init(); cpu_dev_init(); memory_dev_init(); container_dev_init();
From: Dave Jiang dave.jiang@intel.com
mainline inclusion from mainline-v5.13-rc1 commit bbf44abeeabfe05a124535e6c3a9fd7d682d42bf category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4O662 CVE: NA
-------------------------------------------------
Remove module bits in the auxiliary bus code since the auxiliary bus cannot be built as a module and the relevant code is not needed.
Cc: Dave Ertman david.m.ertman@intel.com Suggested-by: Greg Kroah-Hartman gregkh@linuxfoundation.org Signed-off-by: Dave Jiang dave.jiang@intel.com Link: https://lore.kernel.org/r/161307488980.1896017.15627190714413338196.stgit@dj... Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org Signed-off-by: Yuanzheng Song songyuanzheng@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Reviewed-by: Hanjun Guo guohanjun@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- drivers/base/auxiliary.c | 5 ----- 1 file changed, 5 deletions(-)
diff --git a/drivers/base/auxiliary.c b/drivers/base/auxiliary.c index d8b314e7d0fd..adc199dfba3c 100644 --- a/drivers/base/auxiliary.c +++ b/drivers/base/auxiliary.c @@ -265,8 +265,3 @@ void __init auxiliary_bus_init(void) { WARN_ON(bus_register(&auxiliary_bus_type)); } - -MODULE_LICENSE("GPL v2"); -MODULE_DESCRIPTION("Auxiliary Bus"); -MODULE_AUTHOR("David Ertman david.m.ertman@intel.com"); -MODULE_AUTHOR("Kiran Patil kiran.patil@intel.com");
From: Peter Ujfalusi peter.ujfalusi@linux.intel.com
mainline inclusion from mainline-v5.14-rc3 commit 4afa0c22eed33cfe0c590742387f0d16f32412f3 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4O662 CVE: NA
-------------------------------------------------
If driver_register() returns with an error, we need to free the memory allocated for auxdrv->driver.name before returning from __auxiliary_driver_register().
Fixes: 7de3697e9cbd4 ("Add auxiliary bus support") Reviewed-by: Dan Williams dan.j.williams@intel.com Cc: stable stable@vger.kernel.org Signed-off-by: Peter Ujfalusi peter.ujfalusi@linux.intel.com Link: https://lore.kernel.org/r/20210713093438.3173-1-peter.ujfalusi@linux.intel.c... Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org Signed-off-by: Yuanzheng Song songyuanzheng@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Reviewed-by: Hanjun Guo guohanjun@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- drivers/base/auxiliary.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-)
diff --git a/drivers/base/auxiliary.c b/drivers/base/auxiliary.c index adc199dfba3c..6a30264ab2ba 100644 --- a/drivers/base/auxiliary.c +++ b/drivers/base/auxiliary.c @@ -231,6 +231,8 @@ EXPORT_SYMBOL_GPL(auxiliary_find_device); int __auxiliary_driver_register(struct auxiliary_driver *auxdrv, struct module *owner, const char *modname) { + int ret; + if (WARN_ON(!auxdrv->probe) || WARN_ON(!auxdrv->id_table)) return -EINVAL;
@@ -246,7 +248,11 @@ int __auxiliary_driver_register(struct auxiliary_driver *auxdrv, auxdrv->driver.bus = &auxiliary_bus_type; auxdrv->driver.mod_name = modname;
- return driver_register(&auxdrv->driver); + ret = driver_register(&auxdrv->driver); + if (ret) + kfree(auxdrv->driver.name); + + return ret; } EXPORT_SYMBOL_GPL(__auxiliary_driver_register);
From: Daniel Bristot de Oliveira bristot@redhat.com
mainline inclusion from mainline-v5.14-rc1 commit bc87cf0a08d437ea192b15f0918cb581a8698f15 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4G64B CVE: NA
-------------------------------------------------
The hwlat detector and (in preparation for) the osnoise/timerlat tracers have a set of u64 parameters that the user can read/write via tracefs. For instance, we have hwlat_detector's window and width.
To reduce the code duplication, hwlat's window and width share the same read function. However, they do not share their write functions because they do different parameter checks. For instance, the width needs to be smaller than the window, while the window needs to be larger than the width. The same pattern repeats on osnoise/timerlat, and a large portion of the code is devoted to the write functions.
Despite having different checks, the write functions have the same structure:
	read a user-space buffer
	take the lock that protects the value
	check for minimum and maximum acceptable values
	save the value
	release the lock
	return success or error
To reduce the code duplication in the write functions as well, this patch provides a generic read and write implementation for u64 values that need to be within some minimum and/or maximum parameters, while (potentially) being protected by a lock.
To use this interface, the structure trace_min_max_param needs to be filled:
	struct trace_min_max_param {
		struct mutex	*lock;
		u64		*val;
		u64		*min;
		u64		*max;
	};
The desired value is stored in the variable pointed to by *val. If *min points to a minimum acceptable value, it will be checked during the write operation. Likewise, if *max points to a maximum allowable value, it will be checked during the write operation. Finally, if *lock points to a mutex, it will be taken at the beginning of the operation and released at the end.
The definition of a trace_min_max_param needs to be passed as the (private) *data for tracefs_create_file(), and the trace_min_max_fops (added by this patch) as the *fops file_operations.
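As a usage sketch (the names my_width, my_window, interface_lock and parent below are illustrative, not part of this patch):

	static u64 my_width = 500000;
	static u64 my_window = 1000000;
	static DEFINE_MUTEX(interface_lock);

	static struct trace_min_max_param window_param = {
		.lock	= &interface_lock,
		.val	= &my_window,
		.min	= &my_width,	/* window must stay >= width */
		.max	= NULL,		/* no upper bound */
	};

	/* in the tracer's tracefs setup code: */
	tracefs_create_file("window", 0640, parent, &window_param,
			    &trace_min_max_fops);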
Link: https://lkml.kernel.org/r/3e35760a7c8b5c55f16ae5ad5fc54a0e71cbe647.162437231...
Cc: Phil Auld pauld@redhat.com Cc: Sebastian Andrzej Siewior bigeasy@linutronix.de Cc: Kate Carcia kcarcia@redhat.com Cc: Jonathan Corbet corbet@lwn.net Cc: Ingo Molnar mingo@redhat.com Cc: Peter Zijlstra peterz@infradead.org Cc: Thomas Gleixner tglx@linutronix.de Cc: Alexandre Chartre alexandre.chartre@oracle.com Cc: Clark Willaims williams@redhat.com Cc: John Kacur jkacur@redhat.com Cc: Juri Lelli juri.lelli@redhat.com Cc: Borislav Petkov bp@alien8.de Cc: "H. Peter Anvin" hpa@zytor.com Cc: x86@kernel.org Cc: linux-doc@vger.kernel.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Daniel Bristot de Oliveira bristot@redhat.com Signed-off-by: Steven Rostedt (VMware) rostedt@goodmis.org Signed-off-by: Wang ShaoBo bobo.shaobowang@huawei.com Acked-by: Xie XiuQi xiexiuqi@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- kernel/trace/trace.c | 85 ++++++++++++++++++++++++++++++++++++++++++++ kernel/trace/trace.h | 18 ++++++++++ 2 files changed, 103 insertions(+)
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 2ce366687ce4..a29a722f1982 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -7228,6 +7228,91 @@ static const struct file_operations snapshot_raw_fops = {
#endif /* CONFIG_TRACER_SNAPSHOT */
+/* + * trace_min_max_write - Write a u64 value to a trace_min_max_param struct + * @filp: The active open file structure + * @ubuf: The userspace provided buffer to read value into + * @cnt: The maximum number of bytes to read + * @ppos: The current "file" position + * + * This function implements the write interface for a struct trace_min_max_param. + * The filp->private_data must point to a trace_min_max_param structure that + * defines where to write the value, the min and the max acceptable values, + * and a lock to protect the write. + */ +static ssize_t +trace_min_max_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) +{ + struct trace_min_max_param *param = filp->private_data; + u64 val; + int err; + + if (!param) + return -EFAULT; + + err = kstrtoull_from_user(ubuf, cnt, 10, &val); + if (err) + return err; + + if (param->lock) + mutex_lock(param->lock); + + if (param->min && val < *param->min) + err = -EINVAL; + + if (param->max && val > *param->max) + err = -EINVAL; + + if (!err) + *param->val = val; + + if (param->lock) + mutex_unlock(param->lock); + + if (err) + return err; + + return cnt; +} + +/* + * trace_min_max_read - Read a u64 value from a trace_min_max_param struct + * @filp: The active open file structure + * @ubuf: The userspace provided buffer to read value into + * @cnt: The maximum number of bytes to read + * @ppos: The current "file" position + * + * This function implements the read interface for a struct trace_min_max_param. + * The filp->private_data must point to a trace_min_max_param struct with valid + * data. + */ +static ssize_t +trace_min_max_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) +{ + struct trace_min_max_param *param = filp->private_data; + char buf[U64_STR_SIZE]; + int len; + u64 val; + + if (!param) + return -EFAULT; + + val = *param->val; + + if (cnt > sizeof(buf)) + cnt = sizeof(buf); + + len = snprintf(buf, sizeof(buf), "%llu\n", val); + + return simple_read_from_buffer(ubuf, cnt, ppos, buf, len); +} + +const struct file_operations trace_min_max_fops = { + .open = tracing_open_generic, + .read = trace_min_max_read, + .write = trace_min_max_write, +}; + #define TRACING_LOG_ERRS_MAX 8 #define TRACING_LOG_LOC_MAX 128
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 94f8087a3c22..419456b3f52f 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -2105,4 +2105,22 @@ static inline bool is_good_name(const char *name) return true; }
+/* + * This is a generic way to read and write a u64 value from a file in tracefs. + * + * The value is stored on the variable pointed by *val. The value needs + * to be at least *min and at most *max. The write is protected by an + * existing *lock. + */ +struct trace_min_max_param { + struct mutex *lock; + u64 *val; + u64 *min; + u64 *max; +}; + +#define U64_STR_SIZE 24 /* 20 digits max */ + +extern const struct file_operations trace_min_max_fops; + #endif /* _LINUX_KERNEL_TRACE_H */
From: Steven Rostedt rostedt@goodmis.org
mainline inclusion from mainline-v5.14-rc1 commit 62de4f29e9174e67beb8d34ef5ced6730e087a31 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4G64B CVE: NA
-------------------------------------------------
To have nanosecond output displayed in a more human readable format, it's nicer to convert it to a seconds format (XXX.YYYYYYYYY). The problem is that to do so, the number must be divided by NSEC_PER_SEC and also taken modulo NSEC_PER_SEC. But as these numbers are 64 bit, this cannot be done simply with the '/' and '%' operators, but must use do_div() instead.
Instead of performing the expensive do_div() in the hot path of the tracepoint, it is more efficient to perform it during the output phase. But passing do_div() in can confuse the parser, and do_div() doesn't work exactly like a normal C function: it modifies the number in place, and we don't want to modify the actual values in the ring buffer.
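For reference, a minimal sketch of do_div()'s in-place semantics (the values are illustrative):

	u64 val = 1234567890ULL;	/* 1.234567890 seconds, in ns */
	u32 rem;

	rem = do_div(val, NSEC_PER_SEC);
	/* val is now 1 (the quotient), rem is 234567890 (the remainder) */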
Two helper functions are now created:
__print_ns_to_secs() and __print_ns_without_secs()
They both take a value in nanoseconds; the former returns that number divided by NSEC_PER_SEC, and the latter returns it modulo NSEC_PER_SEC, giving a way to print a nice human readable format:
__print_fmt("time=%llu.%09u", __print_ns_to_secs(REC->nsec_val), __print_ns_without_secs(REC->nsec_val))
Link: https://lkml.kernel.org/r/e503b903045496c4ccde52843e1e318b422f7a56.162437231...
Cc: Phil Auld pauld@redhat.com Cc: Sebastian Andrzej Siewior bigeasy@linutronix.de Cc: Kate Carcia kcarcia@redhat.com Cc: Jonathan Corbet corbet@lwn.net Cc: Ingo Molnar mingo@redhat.com Cc: Peter Zijlstra peterz@infradead.org Cc: Thomas Gleixner tglx@linutronix.de Cc: Alexandre Chartre alexandre.chartre@oracle.com Cc: Clark Willaims williams@redhat.com Cc: John Kacur jkacur@redhat.com Cc: Juri Lelli juri.lelli@redhat.com Cc: Borislav Petkov bp@alien8.de Cc: "H. Peter Anvin" hpa@zytor.com Cc: x86@kernel.org Cc: linux-doc@vger.kernel.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Steven Rostedt rostedt@goodmis.org Signed-off-by: Daniel Bristot de Oliveira bristot@redhat.com Signed-off-by: Steven Rostedt (VMware) rostedt@goodmis.org Signed-off-by: Wang ShaoBo bobo.shaobowang@huawei.com Acked-by: Xie XiuQi xiexiuqi@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- include/trace/trace_events.h | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+)
diff --git a/include/trace/trace_events.h b/include/trace/trace_events.h index 7785961d82ba..ea34810846ea 100644 --- a/include/trace/trace_events.h +++ b/include/trace/trace_events.h @@ -347,6 +347,21 @@ TRACE_MAKE_SYSTEM_STR(); trace_print_hex_dump_seq(p, prefix_str, prefix_type, \ rowsize, groupsize, buf, len, ascii)
+#undef __print_ns_to_secs +#define __print_ns_to_secs(value) \ + ({ \ + u64 ____val = (u64)(value); \ + do_div(____val, NSEC_PER_SEC); \ + ____val; \ + }) + +#undef __print_ns_without_secs +#define __print_ns_without_secs(value) \ + ({ \ + u64 ____val = (u64)(value); \ + (u32) do_div(____val, NSEC_PER_SEC); \ + }) + #undef DECLARE_EVENT_CLASS #define DECLARE_EVENT_CLASS(call, proto, args, tstruct, assign, print) \ static notrace enum print_line_t \ @@ -725,6 +740,16 @@ static inline void ftrace_test_probe_##call(void) \ #undef __print_array #undef __print_hex_dump
+/* + * The below is not executed in the kernel. It is only what is + * displayed in the print format for userspace to parse. + */ +#undef __print_ns_to_secs +#define __print_ns_to_secs(val) (val) / 1000000000UL + +#undef __print_ns_without_secs +#define __print_ns_without_secs(val) (val) % 1000000000UL + #undef TP_printk #define TP_printk(fmt, args...) "\"" fmt "\", " __stringify(args)
From: Sebastian Andrzej Siewior bigeasy@linutronix.de
mainline inclusion from mainline-v5.12-rc1 commit 36590c50b2d0729952511129916beeea30d31d81 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4G64B CVE: NA
-------------------------------------------------
The state of the interrupts (irqflags) and the preemption counter are both passed down to tracing_generic_entry_update(). Only one bit of irqflags is actually required: the on/off state. The complete 32 bits of the preemption counter aren't needed either; only whether any of the upper bits (softirq, hardirq and NMI) are set, plus the preemption depth, is needed.
The irqflags and the preemption counter can therefore be evaluated early and the information stored in an integer `trace_ctx'. tracing_generic_entry_update() would use the upper bits as the TRACE_FLAG_* bits and the lower 8 bits as the disabled-preemption depth (considering that one must be subtracted from the counter in one special case).
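The resulting layout, as a sketch (inferred from the packing in the patch below: TRACE_FLAG_* in the upper 16 bits, preemption depth in the low 8 bits):

	 31            16 15          8 7             0
	+----------------+-------------+---------------+
	|  TRACE_FLAG_*  |   (unused)  | preempt depth |
	+----------------+-------------+---------------+

	trace_ctx = (trace_flags << 16) | (pc & 0xff);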
The actual preemption value is not used except for the tracing record, and the `irqflags' variable is mostly used only for the tracing record as well. An exception is, for instance, wakeup_tracer_call() or probe_wakeup_sched_switch(), which explicitly disable interrupts and use `irqflags' to save (and restore) the IRQ state and to record the state.
Struct trace_event_buffer also has the `pc' and `flags' members, which can be replaced with `trace_ctx' since their actual values are not used outside of trace recording.
This will reduce tracing_generic_entry_update() to simply assign values to struct trace_entry. The evaluation of the TRACE_FLAG_* bits is moved to tracing_gen_ctx_flags(), which replaces the preempt_count() and local_save_flags() invocations.
As an example, ftrace_syscall_enter() may invoke:

	trace_buffer_lock_reserve() -> … -> tracing_generic_entry_update()
	event_trigger_unlock_commit()
	  -> ftrace_trace_stack() -> … -> tracing_generic_entry_update()
	  -> ftrace_trace_userstack() -> … -> tracing_generic_entry_update()
In this case the TRACE_FLAG_* bits were evaluated three times. By using the `trace_ctx' they are evaluated once and assigned three times.
A build with all tracers enabled on x86-64 with and without the patch:
	    text      data      bss       dec      hex  filename
	21970669  17084168  7639260  46694097  2c87ed1  vmlinux.old
	21970293  17084168  7639260  46693721  2c87d59  vmlinux.new
text shrank by 376 bytes; data remained constant.
Link: https://lkml.kernel.org/r/20210125194511.3924915-2-bigeasy@linutronix.de
Signed-off-by: Sebastian Andrzej Siewior bigeasy@linutronix.de Signed-off-by: Steven Rostedt (VMware) rostedt@goodmis.org Signed-off-by: Wang ShaoBo bobo.shaobowang@huawei.com Acked-by: Xie XiuQi xiexiuqi@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- include/linux/trace_events.h | 25 +++- kernel/trace/blktrace.c | 17 +-- kernel/trace/trace.c | 208 ++++++++++++++------------- kernel/trace/trace.h | 38 +++-- kernel/trace/trace_branch.c | 6 +- kernel/trace/trace_event_perf.c | 5 +- kernel/trace/trace_events.c | 18 +-- kernel/trace/trace_events_inject.c | 6 +- kernel/trace/trace_functions.c | 28 ++-- kernel/trace/trace_functions_graph.c | 32 ++--- kernel/trace/trace_hwlat.c | 7 +- kernel/trace/trace_irqsoff.c | 86 +++++------ kernel/trace/trace_kprobe.c | 10 +- kernel/trace/trace_mmiotrace.c | 14 +- kernel/trace/trace_sched_wakeup.c | 71 +++++---- kernel/trace/trace_syscalls.c | 20 ++- kernel/trace/trace_uprobe.c | 4 +- 17 files changed, 287 insertions(+), 308 deletions(-)
diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index d321fe5ad1a1..091250b0895a 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -148,17 +148,29 @@ enum print_line_t {
enum print_line_t trace_handle_return(struct trace_seq *s);
-void tracing_generic_entry_update(struct trace_entry *entry, - unsigned short type, - unsigned long flags, - int pc); +static inline void tracing_generic_entry_update(struct trace_entry *entry, + unsigned short type, + unsigned int trace_ctx) +{ + struct task_struct *tsk = current; + + entry->preempt_count = trace_ctx & 0xff; + entry->pid = (tsk) ? tsk->pid : 0; + entry->type = type; + entry->flags = trace_ctx >> 16; +} + +unsigned int tracing_gen_ctx_flags(unsigned long irqflags); +unsigned int tracing_gen_ctx(void); +unsigned int tracing_gen_ctx_dec(void); + struct trace_event_file;
struct ring_buffer_event * trace_event_buffer_lock_reserve(struct trace_buffer **current_buffer, struct trace_event_file *trace_file, int type, unsigned long len, - unsigned long flags, int pc); + unsigned int trace_ctx);
#define TRACE_RECORD_CMDLINE BIT(0) #define TRACE_RECORD_TGID BIT(1) @@ -232,8 +244,7 @@ struct trace_event_buffer { struct ring_buffer_event *event; struct trace_event_file *trace_file; void *entry; - unsigned long flags; - int pc; + unsigned int trace_ctx; struct pt_regs *regs; };
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index b89ff188a618..b3afd103b3aa 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -72,17 +72,17 @@ static void trace_note(struct blk_trace *bt, pid_t pid, int action, struct blk_io_trace *t; struct ring_buffer_event *event = NULL; struct trace_buffer *buffer = NULL; - int pc = 0; + unsigned int trace_ctx = 0; int cpu = smp_processor_id(); bool blk_tracer = blk_tracer_enabled; ssize_t cgid_len = cgid ? sizeof(cgid) : 0;
if (blk_tracer) { buffer = blk_tr->array_buffer.buffer; - pc = preempt_count(); + trace_ctx = tracing_gen_ctx_flags(0); event = trace_buffer_lock_reserve(buffer, TRACE_BLK, sizeof(*t) + len + cgid_len, - 0, pc); + trace_ctx); if (!event) return; t = ring_buffer_event_data(event); @@ -107,7 +107,7 @@ static void trace_note(struct blk_trace *bt, pid_t pid, int action, memcpy((void *) t + sizeof(*t) + cgid_len, data, len);
if (blk_tracer) - trace_buffer_unlock_commit(blk_tr, buffer, event, 0, pc); + trace_buffer_unlock_commit(blk_tr, buffer, event, trace_ctx); } }
@@ -222,8 +222,9 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, struct blk_io_trace *t; unsigned long flags = 0; unsigned long *sequence; + unsigned int trace_ctx = 0; pid_t pid; - int cpu, pc = 0; + int cpu; bool blk_tracer = blk_tracer_enabled; ssize_t cgid_len = cgid ? sizeof(cgid) : 0;
@@ -252,10 +253,10 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, tracing_record_cmdline(current);
buffer = blk_tr->array_buffer.buffer; - pc = preempt_count(); + trace_ctx = tracing_gen_ctx_flags(0); event = trace_buffer_lock_reserve(buffer, TRACE_BLK, sizeof(*t) + pdu_len + cgid_len, - 0, pc); + trace_ctx); if (!event) return; t = ring_buffer_event_data(event); @@ -301,7 +302,7 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, memcpy((void *)t + sizeof(*t) + cgid_len, pdu_data, pdu_len);
if (blk_tracer) { - trace_buffer_unlock_commit(blk_tr, buffer, event, 0, pc); + trace_buffer_unlock_commit(blk_tr, buffer, event, trace_ctx); return; } } diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index a29a722f1982..2573a42e270a 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -176,7 +176,7 @@ static union trace_eval_map_item *trace_eval_maps; int tracing_set_tracer(struct trace_array *tr, const char *buf); static void ftrace_trace_userstack(struct trace_array *tr, struct trace_buffer *buffer, - unsigned long flags, int pc); + unsigned int trace_ctx);
#define MAX_TRACER_SIZE 100 static char bootup_tracer_buf[MAX_TRACER_SIZE] __initdata; @@ -905,23 +905,23 @@ static inline void trace_access_lock_init(void)
#ifdef CONFIG_STACKTRACE static void __ftrace_trace_stack(struct trace_buffer *buffer, - unsigned long flags, - int skip, int pc, struct pt_regs *regs); + unsigned int trace_ctx, + int skip, struct pt_regs *regs); static inline void ftrace_trace_stack(struct trace_array *tr, struct trace_buffer *buffer, - unsigned long flags, - int skip, int pc, struct pt_regs *regs); + unsigned int trace_ctx, + int skip, struct pt_regs *regs);
#else static inline void __ftrace_trace_stack(struct trace_buffer *buffer, - unsigned long flags, - int skip, int pc, struct pt_regs *regs) + unsigned int trace_ctx, + int skip, struct pt_regs *regs) { } static inline void ftrace_trace_stack(struct trace_array *tr, struct trace_buffer *buffer, - unsigned long flags, - int skip, int pc, struct pt_regs *regs) + unsigned long trace_ctx, + int skip, struct pt_regs *regs) { }
@@ -929,24 +929,24 @@ static inline void ftrace_trace_stack(struct trace_array *tr,
static __always_inline void trace_event_setup(struct ring_buffer_event *event, - int type, unsigned long flags, int pc) + int type, unsigned int trace_ctx) { struct trace_entry *ent = ring_buffer_event_data(event);
- tracing_generic_entry_update(ent, type, flags, pc); + tracing_generic_entry_update(ent, type, trace_ctx); }
static __always_inline struct ring_buffer_event * __trace_buffer_lock_reserve(struct trace_buffer *buffer, int type, unsigned long len, - unsigned long flags, int pc) + unsigned int trace_ctx) { struct ring_buffer_event *event;
event = ring_buffer_lock_reserve(buffer, len); if (event != NULL) - trace_event_setup(event, type, flags, pc); + trace_event_setup(event, type, trace_ctx);
return event; } @@ -1007,25 +1007,22 @@ int __trace_puts(unsigned long ip, const char *str, int size) struct ring_buffer_event *event; struct trace_buffer *buffer; struct print_entry *entry; - unsigned long irq_flags; + unsigned int trace_ctx; int alloc; - int pc;
if (!(global_trace.trace_flags & TRACE_ITER_PRINTK)) return 0;
- pc = preempt_count(); - if (unlikely(tracing_selftest_running || tracing_disabled)) return 0;
alloc = sizeof(*entry) + size + 2; /* possible \n added */
- local_save_flags(irq_flags); + trace_ctx = tracing_gen_ctx(); buffer = global_trace.array_buffer.buffer; ring_buffer_nest_start(buffer); - event = __trace_buffer_lock_reserve(buffer, TRACE_PRINT, alloc, - irq_flags, pc); + event = __trace_buffer_lock_reserve(buffer, TRACE_PRINT, alloc, + trace_ctx); if (!event) { size = 0; goto out; @@ -1044,7 +1041,7 @@ int __trace_puts(unsigned long ip, const char *str, int size) entry->buf[size] = '\0';
__buffer_unlock_commit(buffer, event); - ftrace_trace_stack(&global_trace, buffer, irq_flags, 4, pc, NULL); + ftrace_trace_stack(&global_trace, buffer, trace_ctx, 4, NULL); out: ring_buffer_nest_end(buffer); return size; @@ -1061,25 +1058,22 @@ int __trace_bputs(unsigned long ip, const char *str) struct ring_buffer_event *event; struct trace_buffer *buffer; struct bputs_entry *entry; - unsigned long irq_flags; + unsigned int trace_ctx; int size = sizeof(struct bputs_entry); int ret = 0; - int pc;
if (!(global_trace.trace_flags & TRACE_ITER_PRINTK)) return 0;
- pc = preempt_count(); - if (unlikely(tracing_selftest_running || tracing_disabled)) return 0;
- local_save_flags(irq_flags); + trace_ctx = tracing_gen_ctx(); buffer = global_trace.array_buffer.buffer;
ring_buffer_nest_start(buffer); event = __trace_buffer_lock_reserve(buffer, TRACE_BPUTS, size, - irq_flags, pc); + trace_ctx); if (!event) goto out;
@@ -1088,7 +1082,7 @@ int __trace_bputs(unsigned long ip, const char *str) entry->str = str;
__buffer_unlock_commit(buffer, event); - ftrace_trace_stack(&global_trace, buffer, irq_flags, 4, pc, NULL); + ftrace_trace_stack(&global_trace, buffer, trace_ctx, 4, NULL);
ret = 1; out: @@ -2592,36 +2586,69 @@ enum print_line_t trace_handle_return(struct trace_seq *s) } EXPORT_SYMBOL_GPL(trace_handle_return);
-void -tracing_generic_entry_update(struct trace_entry *entry, unsigned short type, - unsigned long flags, int pc) +unsigned int tracing_gen_ctx_flags(unsigned long irqflags) { - struct task_struct *tsk = current; + unsigned int trace_flags = 0; + unsigned int pc; + + pc = preempt_count();
- entry->preempt_count = pc & 0xff; - entry->pid = (tsk) ? tsk->pid : 0; - entry->type = type; - entry->flags = #ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT - (irqs_disabled_flags(flags) ? TRACE_FLAG_IRQS_OFF : 0) | + if (irqs_disabled_flags(irqflags)) + trace_flags |= TRACE_FLAG_IRQS_OFF; #else - TRACE_FLAG_IRQS_NOSUPPORT | + trace_flags |= TRACE_FLAG_IRQS_NOSUPPORT; #endif - ((pc & NMI_MASK ) ? TRACE_FLAG_NMI : 0) | - ((pc & HARDIRQ_MASK) ? TRACE_FLAG_HARDIRQ : 0) | - ((pc & SOFTIRQ_OFFSET) ? TRACE_FLAG_SOFTIRQ : 0) | - (tif_need_resched() ? TRACE_FLAG_NEED_RESCHED : 0) | - (test_preempt_need_resched() ? TRACE_FLAG_PREEMPT_RESCHED : 0); + + if (pc & NMI_MASK) + trace_flags |= TRACE_FLAG_NMI; + if (pc & HARDIRQ_MASK) + trace_flags |= TRACE_FLAG_HARDIRQ; + + if (pc & SOFTIRQ_OFFSET) + trace_flags |= TRACE_FLAG_SOFTIRQ; + + if (tif_need_resched()) + trace_flags |= TRACE_FLAG_NEED_RESCHED; + if (test_preempt_need_resched()) + trace_flags |= TRACE_FLAG_PREEMPT_RESCHED; + return (trace_flags << 16) | (pc & 0xff); +} + +unsigned int tracing_gen_ctx(void) +{ + unsigned long irqflags; + +#ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT + local_save_flags(irqflags); +#else + irqflags = 0; +#endif + return tracing_gen_ctx_flags(irqflags); +} + +unsigned int tracing_gen_ctx_dec(void) +{ + unsigned int trace_ctx; + + trace_ctx = tracing_gen_ctx(); + + /* + * Subtract one from the preemption counter if preemption is enabled, + * see trace_event_buffer_reserve() for details. + */ + if (IS_ENABLED(CONFIG_PREEMPTION)) + trace_ctx--; + return trace_ctx; } -EXPORT_SYMBOL_GPL(tracing_generic_entry_update);
struct ring_buffer_event * trace_buffer_lock_reserve(struct trace_buffer *buffer, int type, unsigned long len, - unsigned long flags, int pc) + unsigned int trace_ctx) { - return __trace_buffer_lock_reserve(buffer, type, len, flags, pc); + return __trace_buffer_lock_reserve(buffer, type, len, trace_ctx); }
DEFINE_PER_CPU(struct ring_buffer_event *, trace_buffered_event); @@ -2741,7 +2768,7 @@ struct ring_buffer_event * trace_event_buffer_lock_reserve(struct trace_buffer **current_rb, struct trace_event_file *trace_file, int type, unsigned long len, - unsigned long flags, int pc) + unsigned int trace_ctx) { struct ring_buffer_event *entry; int val; @@ -2754,15 +2781,15 @@ trace_event_buffer_lock_reserve(struct trace_buffer **current_rb, /* Try to use the per cpu buffer first */ val = this_cpu_inc_return(trace_buffered_event_cnt); if ((len < (PAGE_SIZE - sizeof(*entry) - sizeof(entry->array[0]))) && val == 1) { - trace_event_setup(entry, type, flags, pc); + trace_event_setup(entry, type, trace_ctx); entry->array[0] = len; return entry; } this_cpu_dec(trace_buffered_event_cnt); }
- entry = __trace_buffer_lock_reserve(*current_rb, - type, len, flags, pc); + entry = __trace_buffer_lock_reserve(*current_rb, type, len, + trace_ctx); /* * If tracing is off, but we have triggers enabled * we still need to look at the event data. Use the temp_buffer @@ -2771,8 +2798,8 @@ trace_event_buffer_lock_reserve(struct trace_buffer **current_rb, */ if (!entry && trace_file->flags & EVENT_FILE_FL_TRIGGER_COND) { *current_rb = temp_buffer; - entry = __trace_buffer_lock_reserve(*current_rb, - type, len, flags, pc); + entry = __trace_buffer_lock_reserve(*current_rb, type, len, + trace_ctx); } return entry; } @@ -2858,7 +2885,7 @@ void trace_event_buffer_commit(struct trace_event_buffer *fbuffer) ftrace_exports(fbuffer->event, TRACE_EXPORT_EVENT); event_trigger_unlock_commit_regs(fbuffer->trace_file, fbuffer->buffer, fbuffer->event, fbuffer->entry, - fbuffer->flags, fbuffer->pc, fbuffer->regs); + fbuffer->trace_ctx, fbuffer->regs); } EXPORT_SYMBOL_GPL(trace_event_buffer_commit);
@@ -2874,7 +2901,7 @@ EXPORT_SYMBOL_GPL(trace_event_buffer_commit); void trace_buffer_unlock_commit_regs(struct trace_array *tr, struct trace_buffer *buffer, struct ring_buffer_event *event, - unsigned long flags, int pc, + unsigned int trace_ctx, struct pt_regs *regs) { __buffer_unlock_commit(buffer, event); @@ -2885,8 +2912,8 @@ void trace_buffer_unlock_commit_regs(struct trace_array *tr, * and mmiotrace, but that's ok if they lose a function or * two. They are not that meaningful. */ - ftrace_trace_stack(tr, buffer, flags, regs ? 0 : STACK_SKIP, pc, regs); - ftrace_trace_userstack(tr, buffer, flags, pc); + ftrace_trace_stack(tr, buffer, trace_ctx, regs ? 0 : STACK_SKIP, regs); + ftrace_trace_userstack(tr, buffer, trace_ctx); }
/* @@ -2900,9 +2927,8 @@ trace_buffer_unlock_commit_nostack(struct trace_buffer *buffer, }
void -trace_function(struct trace_array *tr, - unsigned long ip, unsigned long parent_ip, unsigned long flags, - int pc) +trace_function(struct trace_array *tr, unsigned long ip, unsigned long + parent_ip, unsigned int trace_ctx) { struct trace_event_call *call = &event_function; struct trace_buffer *buffer = tr->array_buffer.buffer; @@ -2910,7 +2936,7 @@ trace_function(struct trace_array *tr, struct ftrace_entry *entry;
event = __trace_buffer_lock_reserve(buffer, TRACE_FN, sizeof(*entry), - flags, pc); + trace_ctx); if (!event) return; entry = ring_buffer_event_data(event); @@ -2944,8 +2970,8 @@ static DEFINE_PER_CPU(struct ftrace_stacks, ftrace_stacks); static DEFINE_PER_CPU(int, ftrace_stack_reserve);
static void __ftrace_trace_stack(struct trace_buffer *buffer, - unsigned long flags, - int skip, int pc, struct pt_regs *regs) + unsigned int trace_ctx, + int skip, struct pt_regs *regs) { struct trace_event_call *call = &event_kernel_stack; struct ring_buffer_event *event; @@ -2993,7 +3019,7 @@ static void __ftrace_trace_stack(struct trace_buffer *buffer, size = nr_entries * sizeof(unsigned long); event = __trace_buffer_lock_reserve(buffer, TRACE_STACK, (sizeof(*entry) - sizeof(entry->caller)) + size, - flags, pc); + trace_ctx); if (!event) goto out; entry = ring_buffer_event_data(event); @@ -3014,22 +3040,22 @@ static void __ftrace_trace_stack(struct trace_buffer *buffer,
static inline void ftrace_trace_stack(struct trace_array *tr, struct trace_buffer *buffer, - unsigned long flags, - int skip, int pc, struct pt_regs *regs) + unsigned int trace_ctx, + int skip, struct pt_regs *regs) { if (!(tr->trace_flags & TRACE_ITER_STACKTRACE)) return;
- __ftrace_trace_stack(buffer, flags, skip, pc, regs); + __ftrace_trace_stack(buffer, trace_ctx, skip, regs); }
-void __trace_stack(struct trace_array *tr, unsigned long flags, int skip, - int pc) +void __trace_stack(struct trace_array *tr, unsigned int trace_ctx, + int skip) { struct trace_buffer *buffer = tr->array_buffer.buffer;
if (rcu_is_watching()) { - __ftrace_trace_stack(buffer, flags, skip, pc, NULL); + __ftrace_trace_stack(buffer, trace_ctx, skip, NULL); return; }
@@ -3043,7 +3069,7 @@ void __trace_stack(struct trace_array *tr, unsigned long flags, int skip, return;
rcu_irq_enter_irqson(); - __ftrace_trace_stack(buffer, flags, skip, pc, NULL); + __ftrace_trace_stack(buffer, trace_ctx, skip, NULL); rcu_irq_exit_irqson(); }
@@ -3053,19 +3079,15 @@ void __trace_stack(struct trace_array *tr, unsigned long flags, int skip, */ void trace_dump_stack(int skip) { - unsigned long flags; - if (tracing_disabled || tracing_selftest_running) return;
- local_save_flags(flags); - #ifndef CONFIG_UNWINDER_ORC /* Skip 1 to skip this function. */ skip++; #endif __ftrace_trace_stack(global_trace.array_buffer.buffer, - flags, skip, preempt_count(), NULL); + tracing_gen_ctx(), skip, NULL); } EXPORT_SYMBOL_GPL(trace_dump_stack);
@@ -3074,7 +3096,7 @@ static DEFINE_PER_CPU(int, user_stack_count);
static void ftrace_trace_userstack(struct trace_array *tr, - struct trace_buffer *buffer, unsigned long flags, int pc) + struct trace_buffer *buffer, unsigned int trace_ctx) { struct trace_event_call *call = &event_user_stack; struct ring_buffer_event *event; @@ -3101,7 +3123,7 @@ ftrace_trace_userstack(struct trace_array *tr, __this_cpu_inc(user_stack_count);
event = __trace_buffer_lock_reserve(buffer, TRACE_USER_STACK, - sizeof(*entry), flags, pc); + sizeof(*entry), trace_ctx); if (!event) goto out_drop_count; entry = ring_buffer_event_data(event); @@ -3121,7 +3143,7 @@ ftrace_trace_userstack(struct trace_array *tr, #else /* CONFIG_USER_STACKTRACE_SUPPORT */ static void ftrace_trace_userstack(struct trace_array *tr, struct trace_buffer *buffer, - unsigned long flags, int pc) + unsigned int trace_ctx) { } #endif /* !CONFIG_USER_STACKTRACE_SUPPORT */ @@ -3251,9 +3273,9 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args) struct trace_buffer *buffer; struct trace_array *tr = &global_trace; struct bprint_entry *entry; - unsigned long flags; + unsigned int trace_ctx; char *tbuffer; - int len = 0, size, pc; + int len = 0, size;
if (unlikely(tracing_selftest_running || tracing_disabled)) return 0; @@ -3261,7 +3283,7 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args) /* Don't pollute graph traces with trace_vprintk internals */ pause_graph_tracing();
- pc = preempt_count(); + trace_ctx = tracing_gen_ctx(); preempt_disable_notrace();
tbuffer = get_trace_buf(); @@ -3275,12 +3297,11 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args) if (len > TRACE_BUF_SIZE/sizeof(int) || len < 0) goto out_put;
- local_save_flags(flags); size = sizeof(*entry) + sizeof(u32) * len; buffer = tr->array_buffer.buffer; ring_buffer_nest_start(buffer); event = __trace_buffer_lock_reserve(buffer, TRACE_BPRINT, size, - flags, pc); + trace_ctx); if (!event) goto out; entry = ring_buffer_event_data(event); @@ -3290,7 +3311,7 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args) memcpy(entry->buf, tbuffer, sizeof(u32) * len); if (!call_filter_check_discard(call, entry, buffer, event)) { __buffer_unlock_commit(buffer, event); - ftrace_trace_stack(tr, buffer, flags, 6, pc, NULL); + ftrace_trace_stack(tr, buffer, trace_ctx, 6, NULL); }
out: @@ -3313,9 +3334,9 @@ __trace_array_vprintk(struct trace_buffer *buffer, { struct trace_event_call *call = &event_print; struct ring_buffer_event *event; - int len = 0, size, pc; + int len = 0, size; struct print_entry *entry; - unsigned long flags; + unsigned int trace_ctx; char *tbuffer;
if (tracing_disabled || tracing_selftest_running) @@ -3324,7 +3345,7 @@ __trace_array_vprintk(struct trace_buffer *buffer, /* Don't pollute graph traces with trace_vprintk internals */ pause_graph_tracing();
- pc = preempt_count(); + trace_ctx = tracing_gen_ctx(); preempt_disable_notrace();
@@ -3336,11 +3357,10 @@ __trace_array_vprintk(struct trace_buffer *buffer,
len = vscnprintf(tbuffer, TRACE_BUF_SIZE, fmt, args);
- local_save_flags(flags); size = sizeof(*entry) + len + 1; ring_buffer_nest_start(buffer); event = __trace_buffer_lock_reserve(buffer, TRACE_PRINT, size, - flags, pc); + trace_ctx); if (!event) goto out; entry = ring_buffer_event_data(event); @@ -3349,7 +3369,7 @@ __trace_array_vprintk(struct trace_buffer *buffer, memcpy(&entry->buf, tbuffer, len + 1); if (!call_filter_check_discard(call, entry, buffer, event)) { __buffer_unlock_commit(buffer, event); - ftrace_trace_stack(&global_trace, buffer, flags, 6, pc, NULL); + ftrace_trace_stack(&global_trace, buffer, trace_ctx, 6, NULL); }
out: @@ -6655,7 +6675,6 @@ tracing_mark_write(struct file *filp, const char __user *ubuf, enum event_trigger_type tt = ETT_NONE; struct trace_buffer *buffer; struct print_entry *entry; - unsigned long irq_flags; ssize_t written; int size; int len; @@ -6675,7 +6694,6 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
BUILD_BUG_ON(TRACE_BUF_SIZE >= PAGE_SIZE);
- local_save_flags(irq_flags); size = sizeof(*entry) + cnt + 2; /* add '\0' and possible '\n' */
/* If less than "<faulted>", then make sure we can still add that */ @@ -6684,7 +6702,7 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
buffer = tr->array_buffer.buffer; event = __trace_buffer_lock_reserve(buffer, TRACE_PRINT, size, - irq_flags, preempt_count()); + tracing_gen_ctx()); if (unlikely(!event)) /* Ring buffer disabled, return as if not open for write */ return -EBADF; @@ -6736,7 +6754,6 @@ tracing_mark_raw_write(struct file *filp, const char __user *ubuf, struct ring_buffer_event *event; struct trace_buffer *buffer; struct raw_data_entry *entry; - unsigned long irq_flags; ssize_t written; int size; int len; @@ -6758,14 +6775,13 @@ tracing_mark_raw_write(struct file *filp, const char __user *ubuf,
BUILD_BUG_ON(TRACE_BUF_SIZE >= PAGE_SIZE);
- local_save_flags(irq_flags); size = sizeof(*entry) + cnt; if (cnt < FAULT_SIZE_ID) size += FAULT_SIZE_ID - cnt;
buffer = tr->array_buffer.buffer; event = __trace_buffer_lock_reserve(buffer, TRACE_RAW_DATA, size, - irq_flags, preempt_count()); + tracing_gen_ctx()); if (!event) /* Ring buffer disabled, return as if not open for write */ return -EBADF; diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 419456b3f52f..3261691b5756 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -745,8 +745,7 @@ struct ring_buffer_event * trace_buffer_lock_reserve(struct trace_buffer *buffer, int type, unsigned long len, - unsigned long flags, - int pc); + unsigned int trace_ctx);
struct trace_entry *tracing_get_trace_entry(struct trace_array *tr, struct trace_array_cpu *data); @@ -771,11 +770,11 @@ unsigned long trace_total_entries(struct trace_array *tr); void trace_function(struct trace_array *tr, unsigned long ip, unsigned long parent_ip, - unsigned long flags, int pc); + unsigned int trace_ctx); void trace_graph_function(struct trace_array *tr, unsigned long ip, unsigned long parent_ip, - unsigned long flags, int pc); + unsigned int trace_ctx); void trace_latency_header(struct seq_file *m); void trace_default_header(struct seq_file *m); void print_trace_header(struct seq_file *m, struct trace_iterator *iter); @@ -841,11 +840,10 @@ static inline void latency_fsnotify(struct trace_array *tr) { } #endif
#ifdef CONFIG_STACKTRACE -void __trace_stack(struct trace_array *tr, unsigned long flags, int skip, - int pc); +void __trace_stack(struct trace_array *tr, unsigned int trace_ctx, int skip); #else -static inline void __trace_stack(struct trace_array *tr, unsigned long flags, - int skip, int pc) +static inline void __trace_stack(struct trace_array *tr, unsigned int trace_ctx, + int skip) { } #endif /* CONFIG_STACKTRACE */ @@ -985,10 +983,10 @@ extern void graph_trace_open(struct trace_iterator *iter); extern void graph_trace_close(struct trace_iterator *iter); extern int __trace_graph_entry(struct trace_array *tr, struct ftrace_graph_ent *trace, - unsigned long flags, int pc); + unsigned int trace_ctx); extern void __trace_graph_return(struct trace_array *tr, struct ftrace_graph_ret *trace, - unsigned long flags, int pc); + unsigned int trace_ctx);
#ifdef CONFIG_DYNAMIC_FTRACE extern struct ftrace_hash __rcu *ftrace_graph_hash; @@ -1451,15 +1449,15 @@ extern int call_filter_check_discard(struct trace_event_call *call, void *rec, void trace_buffer_unlock_commit_regs(struct trace_array *tr, struct trace_buffer *buffer, struct ring_buffer_event *event, - unsigned long flags, int pc, + unsigned int trace_ctx, struct pt_regs *regs);
static inline void trace_buffer_unlock_commit(struct trace_array *tr, struct trace_buffer *buffer, struct ring_buffer_event *event, - unsigned long flags, int pc) + unsigned int trace_ctx) { - trace_buffer_unlock_commit_regs(tr, buffer, event, flags, pc, NULL); + trace_buffer_unlock_commit_regs(tr, buffer, event, trace_ctx, NULL); }
DECLARE_PER_CPU(struct ring_buffer_event *, trace_buffered_event); @@ -1520,8 +1518,7 @@ __event_trigger_test_discard(struct trace_event_file *file, * @buffer: The ring buffer that the event is being written to * @event: The event meta data in the ring buffer * @entry: The event itself - * @irq_flags: The state of the interrupts at the start of the event - * @pc: The state of the preempt count at the start of the event. + * @trace_ctx: The tracing context flags. * * This is a helper function to handle triggers that require data * from the event itself. It also tests the event against filters and @@ -1531,12 +1528,12 @@ static inline void event_trigger_unlock_commit(struct trace_event_file *file, struct trace_buffer *buffer, struct ring_buffer_event *event, - void *entry, unsigned long irq_flags, int pc) + void *entry, unsigned int trace_ctx) { enum event_trigger_type tt = ETT_NONE;
if (!__event_trigger_test_discard(file, buffer, event, entry, &tt)) - trace_buffer_unlock_commit(file->tr, buffer, event, irq_flags, pc); + trace_buffer_unlock_commit(file->tr, buffer, event, trace_ctx);
if (tt) event_triggers_post_call(file, tt); @@ -1548,8 +1545,7 @@ event_trigger_unlock_commit(struct trace_event_file *file, * @buffer: The ring buffer that the event is being written to * @event: The event meta data in the ring buffer * @entry: The event itself - * @irq_flags: The state of the interrupts at the start of the event - * @pc: The state of the preempt count at the start of the event. + * @trace_ctx: The tracing context flags. * * This is a helper function to handle triggers that require data * from the event itself. It also tests the event against filters and @@ -1562,14 +1558,14 @@ static inline void event_trigger_unlock_commit_regs(struct trace_event_file *file, struct trace_buffer *buffer, struct ring_buffer_event *event, - void *entry, unsigned long irq_flags, int pc, + void *entry, unsigned int trace_ctx, struct pt_regs *regs) { enum event_trigger_type tt = ETT_NONE;
if (!__event_trigger_test_discard(file, buffer, event, entry, &tt)) trace_buffer_unlock_commit_regs(file->tr, buffer, event, - irq_flags, pc, regs); + trace_ctx, regs);
if (tt) event_triggers_post_call(file, tt); diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c index eff099123aa2..e47fdb4c92fb 100644 --- a/kernel/trace/trace_branch.c +++ b/kernel/trace/trace_branch.c @@ -37,7 +37,7 @@ probe_likely_condition(struct ftrace_likely_data *f, int val, int expect) struct ring_buffer_event *event; struct trace_branch *entry; unsigned long flags; - int pc; + unsigned int trace_ctx; const char *p;
if (current->trace_recursion & TRACE_BRANCH_BIT) @@ -59,10 +59,10 @@ probe_likely_condition(struct ftrace_likely_data *f, int val, int expect) if (atomic_read(&data->disabled)) goto out;
- pc = preempt_count(); + trace_ctx = tracing_gen_ctx_flags(flags); buffer = tr->array_buffer.buffer; event = trace_buffer_lock_reserve(buffer, TRACE_BRANCH, - sizeof(*entry), flags, pc); + sizeof(*entry), trace_ctx); if (!event) goto out;
diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index 643e0b19920d..0443dd61667b 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c @@ -421,11 +421,8 @@ NOKPROBE_SYMBOL(perf_trace_buf_alloc); void perf_trace_buf_update(void *record, u16 type) { struct trace_entry *entry = record; - int pc = preempt_count(); - unsigned long flags;
- local_save_flags(flags); - tracing_generic_entry_update(entry, type, flags, pc); + tracing_generic_entry_update(entry, type, tracing_gen_ctx()); } NOKPROBE_SYMBOL(perf_trace_buf_update);
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index ab3cb67b869e..546a535f1490 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -258,22 +258,19 @@ void *trace_event_buffer_reserve(struct trace_event_buffer *fbuffer, trace_event_ignore_this_pid(trace_file)) return NULL;
- local_save_flags(fbuffer->flags); - fbuffer->pc = preempt_count(); /* * If CONFIG_PREEMPTION is enabled, then the tracepoint itself disables * preemption (adding one to the preempt_count). Since we are * interested in the preempt_count at the time the tracepoint was * hit, we need to subtract one to offset the increment. */ - if (IS_ENABLED(CONFIG_PREEMPTION)) - fbuffer->pc--; + fbuffer->trace_ctx = tracing_gen_ctx_dec(); fbuffer->trace_file = trace_file;
fbuffer->event = trace_event_buffer_lock_reserve(&fbuffer->buffer, trace_file, event_call->event.type, len, - fbuffer->flags, fbuffer->pc); + fbuffer->trace_ctx); if (!fbuffer->event) return NULL;
@@ -3679,12 +3676,11 @@ function_test_events_call(unsigned long ip, unsigned long parent_ip, struct trace_buffer *buffer; struct ring_buffer_event *event; struct ftrace_entry *entry; - unsigned long flags; + unsigned int trace_ctx; long disabled; int cpu; - int pc;
- pc = preempt_count(); + trace_ctx = tracing_gen_ctx(); preempt_disable_notrace(); cpu = raw_smp_processor_id(); disabled = atomic_inc_return(&per_cpu(ftrace_test_event_disable, cpu)); @@ -3692,11 +3688,9 @@ function_test_events_call(unsigned long ip, unsigned long parent_ip, if (disabled != 1) goto out;
- local_save_flags(flags); - event = trace_event_buffer_lock_reserve(&buffer, &event_trace_file, TRACE_FN, sizeof(*entry), - flags, pc); + trace_ctx); if (!event) goto out; entry = ring_buffer_event_data(event); @@ -3704,7 +3698,7 @@ function_test_events_call(unsigned long ip, unsigned long parent_ip, entry->parent_ip = parent_ip;
event_trigger_unlock_commit(&event_trace_file, buffer, event, - entry, flags, pc); + entry, trace_ctx); out: atomic_dec(&per_cpu(ftrace_test_event_disable, cpu)); preempt_enable_notrace(); diff --git a/kernel/trace/trace_events_inject.c b/kernel/trace/trace_events_inject.c index 22bcf7c51d1e..c188045c5f97 100644 --- a/kernel/trace/trace_events_inject.c +++ b/kernel/trace/trace_events_inject.c @@ -192,7 +192,6 @@ static void *trace_alloc_entry(struct trace_event_call *call, int *size) static int parse_entry(char *str, struct trace_event_call *call, void **pentry) { struct ftrace_event_field *field; - unsigned long irq_flags; void *entry = NULL; int entry_size; u64 val = 0; @@ -203,9 +202,8 @@ static int parse_entry(char *str, struct trace_event_call *call, void **pentry) if (!entry) return -ENOMEM;
- local_save_flags(irq_flags); - tracing_generic_entry_update(entry, call->event.type, irq_flags, - preempt_count()); + tracing_generic_entry_update(entry, call->event.type, + tracing_gen_ctx());
while ((len = parse_field(str, call, &field, &val)) > 0) { if (is_function_field(field)) diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index 93e20ed642e5..8606cb73341e 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -133,15 +133,14 @@ function_trace_call(unsigned long ip, unsigned long parent_ip, { struct trace_array *tr = op->private; struct trace_array_cpu *data; - unsigned long flags; + unsigned int trace_ctx; int bit; int cpu; - int pc;
if (unlikely(!tr->function_enabled)) return;
- pc = preempt_count(); + trace_ctx = tracing_gen_ctx(); preempt_disable_notrace();
bit = trace_test_and_set_recursion(TRACE_FTRACE_START); @@ -150,10 +149,9 @@ function_trace_call(unsigned long ip, unsigned long parent_ip,
cpu = smp_processor_id(); data = per_cpu_ptr(tr->array_buffer.data, cpu); - if (!atomic_read(&data->disabled)) { - local_save_flags(flags); - trace_function(tr, ip, parent_ip, flags, pc); - } + if (!atomic_read(&data->disabled)) + trace_function(tr, ip, parent_ip, trace_ctx); + trace_clear_recursion(bit);
out: @@ -187,7 +185,7 @@ function_stack_trace_call(unsigned long ip, unsigned long parent_ip, unsigned long flags; long disabled; int cpu; - int pc; + unsigned int trace_ctx;
if (unlikely(!tr->function_enabled)) return; @@ -202,9 +200,9 @@ function_stack_trace_call(unsigned long ip, unsigned long parent_ip, disabled = atomic_inc_return(&data->disabled);
if (likely(disabled == 1)) { - pc = preempt_count(); - trace_function(tr, ip, parent_ip, flags, pc); - __trace_stack(tr, flags, STACK_SKIP, pc); + trace_ctx = tracing_gen_ctx_flags(flags); + trace_function(tr, ip, parent_ip, trace_ctx); + __trace_stack(tr, trace_ctx, STACK_SKIP); }
atomic_dec(&data->disabled); @@ -407,13 +405,11 @@ ftrace_traceoff(unsigned long ip, unsigned long parent_ip,
static __always_inline void trace_stack(struct trace_array *tr) { - unsigned long flags; - int pc; + unsigned int trace_ctx;
- local_save_flags(flags); - pc = preempt_count(); + trace_ctx = tracing_gen_ctx();
- __trace_stack(tr, flags, FTRACE_STACK_SKIP, pc); + __trace_stack(tr, trace_ctx, FTRACE_STACK_SKIP); }
static void diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 60d66278aa0d..b086ba8bb3d6 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -96,8 +96,7 @@ print_graph_duration(struct trace_array *tr, unsigned long long duration,
int __trace_graph_entry(struct trace_array *tr, struct ftrace_graph_ent *trace, - unsigned long flags, - int pc) + unsigned int trace_ctx) { struct trace_event_call *call = &event_funcgraph_entry; struct ring_buffer_event *event; @@ -105,7 +104,7 @@ int __trace_graph_entry(struct trace_array *tr, struct ftrace_graph_ent_entry *entry;
event = trace_buffer_lock_reserve(buffer, TRACE_GRAPH_ENT, - sizeof(*entry), flags, pc); + sizeof(*entry), trace_ctx); if (!event) return 0; entry = ring_buffer_event_data(event); @@ -129,10 +128,10 @@ int trace_graph_entry(struct ftrace_graph_ent *trace) struct trace_array *tr = graph_array; struct trace_array_cpu *data; unsigned long flags; + unsigned int trace_ctx; long disabled; int ret; int cpu; - int pc;
if (trace_recursion_test(TRACE_GRAPH_NOTRACE_BIT)) return 0; @@ -174,8 +173,8 @@ int trace_graph_entry(struct ftrace_graph_ent *trace) data = per_cpu_ptr(tr->array_buffer.data, cpu); disabled = atomic_inc_return(&data->disabled); if (likely(disabled == 1)) { - pc = preempt_count(); - ret = __trace_graph_entry(tr, trace, flags, pc); + trace_ctx = tracing_gen_ctx_flags(flags); + ret = __trace_graph_entry(tr, trace, trace_ctx); } else { ret = 0; } @@ -188,7 +187,7 @@ int trace_graph_entry(struct ftrace_graph_ent *trace)
static void __trace_graph_function(struct trace_array *tr, - unsigned long ip, unsigned long flags, int pc) + unsigned long ip, unsigned int trace_ctx) { u64 time = trace_clock_local(); struct ftrace_graph_ent ent = { @@ -202,22 +201,21 @@ __trace_graph_function(struct trace_array *tr, .rettime = time, };
- __trace_graph_entry(tr, &ent, flags, pc); - __trace_graph_return(tr, &ret, flags, pc); + __trace_graph_entry(tr, &ent, trace_ctx); + __trace_graph_return(tr, &ret, trace_ctx); }
void trace_graph_function(struct trace_array *tr, unsigned long ip, unsigned long parent_ip, - unsigned long flags, int pc) + unsigned int trace_ctx) { - __trace_graph_function(tr, ip, flags, pc); + __trace_graph_function(tr, ip, trace_ctx); }
void __trace_graph_return(struct trace_array *tr, struct ftrace_graph_ret *trace, - unsigned long flags, - int pc) + unsigned int trace_ctx) { struct trace_event_call *call = &event_funcgraph_exit; struct ring_buffer_event *event; @@ -225,7 +223,7 @@ void __trace_graph_return(struct trace_array *tr, struct ftrace_graph_ret_entry *entry;
event = trace_buffer_lock_reserve(buffer, TRACE_GRAPH_RET, - sizeof(*entry), flags, pc); + sizeof(*entry), trace_ctx); if (!event) return; entry = ring_buffer_event_data(event); @@ -239,9 +237,9 @@ void trace_graph_return(struct ftrace_graph_ret *trace) struct trace_array *tr = graph_array; struct trace_array_cpu *data; unsigned long flags; + unsigned int trace_ctx; long disabled; int cpu; - int pc;
ftrace_graph_addr_finish(trace);
@@ -255,8 +253,8 @@ void trace_graph_return(struct ftrace_graph_ret *trace) data = per_cpu_ptr(tr->array_buffer.data, cpu); disabled = atomic_inc_return(&data->disabled); if (likely(disabled == 1)) { - pc = preempt_count(); - __trace_graph_return(tr, trace, flags, pc); + trace_ctx = tracing_gen_ctx_flags(flags); + __trace_graph_return(tr, trace, trace_ctx); } atomic_dec(&data->disabled); local_irq_restore(flags); diff --git a/kernel/trace/trace_hwlat.c b/kernel/trace/trace_hwlat.c index d071fc271eef..4c01c5d8b9a7 100644 --- a/kernel/trace/trace_hwlat.c +++ b/kernel/trace/trace_hwlat.c @@ -108,14 +108,9 @@ static void trace_hwlat_sample(struct hwlat_sample *sample) struct trace_buffer *buffer = tr->array_buffer.buffer; struct ring_buffer_event *event; struct hwlat_entry *entry; - unsigned long flags; - int pc; - - pc = preempt_count(); - local_save_flags(flags);
event = trace_buffer_lock_reserve(buffer, TRACE_HWLAT, sizeof(*entry), - flags, pc); + tracing_gen_ctx()); if (!event) return; entry = ring_buffer_event_data(event); diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index ee4571b624bc..f11add83c108 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -143,11 +143,14 @@ irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip, struct trace_array *tr = irqsoff_trace; struct trace_array_cpu *data; unsigned long flags; + unsigned int trace_ctx;
if (!func_prolog_dec(tr, &data, &flags)) return;
- trace_function(tr, ip, parent_ip, flags, preempt_count()); + trace_ctx = tracing_gen_ctx_flags(flags); + + trace_function(tr, ip, parent_ip, trace_ctx);
atomic_dec(&data->disabled); } @@ -177,8 +180,8 @@ static int irqsoff_graph_entry(struct ftrace_graph_ent *trace) struct trace_array *tr = irqsoff_trace; struct trace_array_cpu *data; unsigned long flags; + unsigned int trace_ctx; int ret; - int pc;
if (ftrace_graph_ignore_func(trace)) return 0; @@ -195,8 +198,8 @@ static int irqsoff_graph_entry(struct ftrace_graph_ent *trace) if (!func_prolog_dec(tr, &data, &flags)) return 0;
- pc = preempt_count(); - ret = __trace_graph_entry(tr, trace, flags, pc); + trace_ctx = tracing_gen_ctx_flags(flags); + ret = __trace_graph_entry(tr, trace, trace_ctx); atomic_dec(&data->disabled);
return ret; @@ -207,15 +210,15 @@ static void irqsoff_graph_return(struct ftrace_graph_ret *trace) struct trace_array *tr = irqsoff_trace; struct trace_array_cpu *data; unsigned long flags; - int pc; + unsigned int trace_ctx;
ftrace_graph_addr_finish(trace);
if (!func_prolog_dec(tr, &data, &flags)) return;
- pc = preempt_count(); - __trace_graph_return(tr, trace, flags, pc); + trace_ctx = tracing_gen_ctx_flags(flags); + __trace_graph_return(tr, trace, trace_ctx); atomic_dec(&data->disabled); }
@@ -267,12 +270,12 @@ static void irqsoff_print_header(struct seq_file *s) static void __trace_function(struct trace_array *tr, unsigned long ip, unsigned long parent_ip, - unsigned long flags, int pc) + unsigned int trace_ctx) { if (is_graph(tr)) - trace_graph_function(tr, ip, parent_ip, flags, pc); + trace_graph_function(tr, ip, parent_ip, trace_ctx); else - trace_function(tr, ip, parent_ip, flags, pc); + trace_function(tr, ip, parent_ip, trace_ctx); }
#else @@ -322,15 +325,13 @@ check_critical_timing(struct trace_array *tr, { u64 T0, T1, delta; unsigned long flags; - int pc; + unsigned int trace_ctx;
T0 = data->preempt_timestamp; T1 = ftrace_now(cpu); delta = T1-T0;
- local_save_flags(flags); - - pc = preempt_count(); + trace_ctx = tracing_gen_ctx();
if (!report_latency(tr, delta)) goto out; @@ -341,9 +342,9 @@ check_critical_timing(struct trace_array *tr, if (!report_latency(tr, delta)) goto out_unlock;
- __trace_function(tr, CALLER_ADDR0, parent_ip, flags, pc); + __trace_function(tr, CALLER_ADDR0, parent_ip, trace_ctx); /* Skip 5 functions to get to the irq/preempt enable function */ - __trace_stack(tr, flags, 5, pc); + __trace_stack(tr, trace_ctx, 5);
if (data->critical_sequence != max_sequence) goto out_unlock; @@ -363,16 +364,15 @@ check_critical_timing(struct trace_array *tr, out: data->critical_sequence = max_sequence; data->preempt_timestamp = ftrace_now(cpu); - __trace_function(tr, CALLER_ADDR0, parent_ip, flags, pc); + __trace_function(tr, CALLER_ADDR0, parent_ip, trace_ctx); }
static nokprobe_inline void -start_critical_timing(unsigned long ip, unsigned long parent_ip, int pc) +start_critical_timing(unsigned long ip, unsigned long parent_ip) { int cpu; struct trace_array *tr = irqsoff_trace; struct trace_array_cpu *data; - unsigned long flags;
if (!tracer_enabled || !tracing_is_enabled()) return; @@ -393,9 +393,7 @@ start_critical_timing(unsigned long ip, unsigned long parent_ip, int pc) data->preempt_timestamp = ftrace_now(cpu); data->critical_start = parent_ip ? : ip;
- local_save_flags(flags); - - __trace_function(tr, ip, parent_ip, flags, pc); + __trace_function(tr, ip, parent_ip, tracing_gen_ctx());
per_cpu(tracing_cpu, cpu) = 1;
@@ -403,12 +401,12 @@ start_critical_timing(unsigned long ip, unsigned long parent_ip, int pc) }
static nokprobe_inline void -stop_critical_timing(unsigned long ip, unsigned long parent_ip, int pc) +stop_critical_timing(unsigned long ip, unsigned long parent_ip) { int cpu; struct trace_array *tr = irqsoff_trace; struct trace_array_cpu *data; - unsigned long flags; + unsigned int trace_ctx;
cpu = raw_smp_processor_id(); /* Always clear the tracing cpu on stopping the trace */ @@ -428,8 +426,8 @@ stop_critical_timing(unsigned long ip, unsigned long parent_ip, int pc)
atomic_inc(&data->disabled);
- local_save_flags(flags); - __trace_function(tr, ip, parent_ip, flags, pc); + trace_ctx = tracing_gen_ctx(); + __trace_function(tr, ip, parent_ip, trace_ctx); check_critical_timing(tr, data, parent_ip ? : ip, cpu); data->critical_start = 0; atomic_dec(&data->disabled); @@ -438,20 +436,16 @@ stop_critical_timing(unsigned long ip, unsigned long parent_ip, int pc) /* start and stop critical timings used to for stoppage (in idle) */ void start_critical_timings(void) { - int pc = preempt_count(); - - if (preempt_trace(pc) || irq_trace()) - start_critical_timing(CALLER_ADDR0, CALLER_ADDR1, pc); + if (preempt_trace(preempt_count()) || irq_trace()) + start_critical_timing(CALLER_ADDR0, CALLER_ADDR1); } EXPORT_SYMBOL_GPL(start_critical_timings); NOKPROBE_SYMBOL(start_critical_timings);
void stop_critical_timings(void) { - int pc = preempt_count(); - - if (preempt_trace(pc) || irq_trace()) - stop_critical_timing(CALLER_ADDR0, CALLER_ADDR1, pc); + if (preempt_trace(preempt_count()) || irq_trace()) + stop_critical_timing(CALLER_ADDR0, CALLER_ADDR1); } EXPORT_SYMBOL_GPL(stop_critical_timings); NOKPROBE_SYMBOL(stop_critical_timings); @@ -613,19 +607,15 @@ static void irqsoff_tracer_stop(struct trace_array *tr) */ void tracer_hardirqs_on(unsigned long a0, unsigned long a1) { - unsigned int pc = preempt_count(); - - if (!preempt_trace(pc) && irq_trace()) - stop_critical_timing(a0, a1, pc); + if (!preempt_trace(preempt_count()) && irq_trace()) + stop_critical_timing(a0, a1); } NOKPROBE_SYMBOL(tracer_hardirqs_on);
void tracer_hardirqs_off(unsigned long a0, unsigned long a1) { - unsigned int pc = preempt_count(); - - if (!preempt_trace(pc) && irq_trace()) - start_critical_timing(a0, a1, pc); + if (!preempt_trace(preempt_count()) && irq_trace()) + start_critical_timing(a0, a1); } NOKPROBE_SYMBOL(tracer_hardirqs_off);
@@ -665,18 +655,14 @@ static struct tracer irqsoff_tracer __read_mostly = #ifdef CONFIG_PREEMPT_TRACER void tracer_preempt_on(unsigned long a0, unsigned long a1) { - int pc = preempt_count(); - - if (preempt_trace(pc) && !irq_trace()) - stop_critical_timing(a0, a1, pc); + if (preempt_trace(preempt_count()) && !irq_trace()) + stop_critical_timing(a0, a1); }
void tracer_preempt_off(unsigned long a0, unsigned long a1) { - int pc = preempt_count(); - - if (preempt_trace(pc) && !irq_trace()) - start_critical_timing(a0, a1, pc); + if (preempt_trace(preempt_count()) && !irq_trace()) + start_critical_timing(a0, a1); }
static int preemptoff_tracer_init(struct trace_array *tr) diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 552dbc9d5226..11242ff53663 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -1390,8 +1390,7 @@ __kprobe_trace_func(struct trace_kprobe *tk, struct pt_regs *regs, if (trace_trigger_soft_disabled(trace_file)) return;
- local_save_flags(fbuffer.flags); - fbuffer.pc = preempt_count(); + fbuffer.trace_ctx = tracing_gen_ctx(); fbuffer.trace_file = trace_file;
dsize = __get_data_size(&tk->tp, regs); @@ -1400,7 +1399,7 @@ __kprobe_trace_func(struct trace_kprobe *tk, struct pt_regs *regs, trace_event_buffer_lock_reserve(&fbuffer.buffer, trace_file, call->event.type, sizeof(*entry) + tk->tp.size + dsize, - fbuffer.flags, fbuffer.pc); + fbuffer.trace_ctx); if (!fbuffer.event) return;
@@ -1438,8 +1437,7 @@ __kretprobe_trace_func(struct trace_kprobe *tk, struct kretprobe_instance *ri, if (trace_trigger_soft_disabled(trace_file)) return;
- local_save_flags(fbuffer.flags); - fbuffer.pc = preempt_count(); + fbuffer.trace_ctx = tracing_gen_ctx(); fbuffer.trace_file = trace_file;
dsize = __get_data_size(&tk->tp, regs); @@ -1447,7 +1445,7 @@ __kretprobe_trace_func(struct trace_kprobe *tk, struct kretprobe_instance *ri, trace_event_buffer_lock_reserve(&fbuffer.buffer, trace_file, call->event.type, sizeof(*entry) + tk->tp.size + dsize, - fbuffer.flags, fbuffer.pc); + fbuffer.trace_ctx); if (!fbuffer.event) return;
diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c index 84582bf1ed5f..7221ae0b4c47 100644 --- a/kernel/trace/trace_mmiotrace.c +++ b/kernel/trace/trace_mmiotrace.c @@ -300,10 +300,11 @@ static void __trace_mmiotrace_rw(struct trace_array *tr, struct trace_buffer *buffer = tr->array_buffer.buffer; struct ring_buffer_event *event; struct trace_mmiotrace_rw *entry; - int pc = preempt_count(); + unsigned int trace_ctx;
+ trace_ctx = tracing_gen_ctx_flags(0); event = trace_buffer_lock_reserve(buffer, TRACE_MMIO_RW, - sizeof(*entry), 0, pc); + sizeof(*entry), trace_ctx); if (!event) { atomic_inc(&dropped_count); return; @@ -312,7 +313,7 @@ static void __trace_mmiotrace_rw(struct trace_array *tr, entry->rw = *rw;
if (!call_filter_check_discard(call, entry, buffer, event)) - trace_buffer_unlock_commit(tr, buffer, event, 0, pc); + trace_buffer_unlock_commit(tr, buffer, event, trace_ctx); }
void mmio_trace_rw(struct mmiotrace_rw *rw) @@ -330,10 +331,11 @@ static void __trace_mmiotrace_map(struct trace_array *tr, struct trace_buffer *buffer = tr->array_buffer.buffer; struct ring_buffer_event *event; struct trace_mmiotrace_map *entry; - int pc = preempt_count(); + unsigned int trace_ctx;
+ trace_ctx = tracing_gen_ctx_flags(0); event = trace_buffer_lock_reserve(buffer, TRACE_MMIO_MAP, - sizeof(*entry), 0, pc); + sizeof(*entry), trace_ctx); if (!event) { atomic_inc(&dropped_count); return; @@ -342,7 +344,7 @@ static void __trace_mmiotrace_map(struct trace_array *tr, entry->map = *map;
if (!call_filter_check_discard(call, entry, buffer, event)) - trace_buffer_unlock_commit(tr, buffer, event, 0, pc); + trace_buffer_unlock_commit(tr, buffer, event, trace_ctx); }
void mmio_trace_mapping(struct mmiotrace_map *map) diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index 97b10bb31a1f..f1c603358ff3 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -67,7 +67,7 @@ static bool function_enabled; static int func_prolog_preempt_disable(struct trace_array *tr, struct trace_array_cpu **data, - int *pc) + unsigned int *trace_ctx) { long disabled; int cpu; @@ -75,7 +75,7 @@ func_prolog_preempt_disable(struct trace_array *tr, if (likely(!wakeup_task)) return 0;
- *pc = preempt_count(); + *trace_ctx = tracing_gen_ctx(); preempt_disable_notrace();
cpu = raw_smp_processor_id(); @@ -116,8 +116,8 @@ static int wakeup_graph_entry(struct ftrace_graph_ent *trace) { struct trace_array *tr = wakeup_trace; struct trace_array_cpu *data; - unsigned long flags; - int pc, ret = 0; + unsigned int trace_ctx; + int ret = 0;
if (ftrace_graph_ignore_func(trace)) return 0; @@ -131,11 +131,10 @@ static int wakeup_graph_entry(struct ftrace_graph_ent *trace) if (ftrace_graph_notrace_addr(trace->func)) return 1;
- if (!func_prolog_preempt_disable(tr, &data, &pc)) + if (!func_prolog_preempt_disable(tr, &data, &trace_ctx)) return 0;
- local_save_flags(flags); - ret = __trace_graph_entry(tr, trace, flags, pc); + ret = __trace_graph_entry(tr, trace, trace_ctx); atomic_dec(&data->disabled); preempt_enable_notrace();
@@ -146,16 +145,14 @@ static void wakeup_graph_return(struct ftrace_graph_ret *trace) { struct trace_array *tr = wakeup_trace; struct trace_array_cpu *data; - unsigned long flags; - int pc; + unsigned int trace_ctx;
ftrace_graph_addr_finish(trace);
- if (!func_prolog_preempt_disable(tr, &data, &pc)) + if (!func_prolog_preempt_disable(tr, &data, &trace_ctx)) return;
- local_save_flags(flags); - __trace_graph_return(tr, trace, flags, pc); + __trace_graph_return(tr, trace, trace_ctx); atomic_dec(&data->disabled);
preempt_enable_notrace(); @@ -217,13 +214,13 @@ wakeup_tracer_call(unsigned long ip, unsigned long parent_ip, struct trace_array *tr = wakeup_trace; struct trace_array_cpu *data; unsigned long flags; - int pc; + unsigned int trace_ctx;
- if (!func_prolog_preempt_disable(tr, &data, &pc)) + if (!func_prolog_preempt_disable(tr, &data, &trace_ctx)) return;
local_irq_save(flags); - trace_function(tr, ip, parent_ip, flags, pc); + trace_function(tr, ip, parent_ip, trace_ctx); local_irq_restore(flags);
atomic_dec(&data->disabled); @@ -303,12 +300,12 @@ static void wakeup_print_header(struct seq_file *s) static void __trace_function(struct trace_array *tr, unsigned long ip, unsigned long parent_ip, - unsigned long flags, int pc) + unsigned int trace_ctx) { if (is_graph(tr)) - trace_graph_function(tr, ip, parent_ip, flags, pc); + trace_graph_function(tr, ip, parent_ip, trace_ctx); else - trace_function(tr, ip, parent_ip, flags, pc); + trace_function(tr, ip, parent_ip, trace_ctx); }
static int wakeup_flag_changed(struct trace_array *tr, u32 mask, int set) @@ -375,7 +372,7 @@ static void tracing_sched_switch_trace(struct trace_array *tr, struct task_struct *prev, struct task_struct *next, - unsigned long flags, int pc) + unsigned int trace_ctx) { struct trace_event_call *call = &event_context_switch; struct trace_buffer *buffer = tr->array_buffer.buffer; @@ -383,7 +380,7 @@ tracing_sched_switch_trace(struct trace_array *tr, struct ctx_switch_entry *entry;
event = trace_buffer_lock_reserve(buffer, TRACE_CTX, - sizeof(*entry), flags, pc); + sizeof(*entry), trace_ctx); if (!event) return; entry = ring_buffer_event_data(event); @@ -396,14 +393,14 @@ tracing_sched_switch_trace(struct trace_array *tr, entry->next_cpu = task_cpu(next);
if (!call_filter_check_discard(call, entry, buffer, event)) - trace_buffer_unlock_commit(tr, buffer, event, flags, pc); + trace_buffer_unlock_commit(tr, buffer, event, trace_ctx); }
static void tracing_sched_wakeup_trace(struct trace_array *tr, struct task_struct *wakee, struct task_struct *curr, - unsigned long flags, int pc) + unsigned int trace_ctx) { struct trace_event_call *call = &event_wakeup; struct ring_buffer_event *event; @@ -411,7 +408,7 @@ tracing_sched_wakeup_trace(struct trace_array *tr, struct trace_buffer *buffer = tr->array_buffer.buffer;
event = trace_buffer_lock_reserve(buffer, TRACE_WAKE, - sizeof(*entry), flags, pc); + sizeof(*entry), trace_ctx); if (!event) return; entry = ring_buffer_event_data(event); @@ -424,7 +421,7 @@ tracing_sched_wakeup_trace(struct trace_array *tr, entry->next_cpu = task_cpu(wakee);
if (!call_filter_check_discard(call, entry, buffer, event)) - trace_buffer_unlock_commit(tr, buffer, event, flags, pc); + trace_buffer_unlock_commit(tr, buffer, event, trace_ctx); }
static void notrace @@ -436,7 +433,7 @@ probe_wakeup_sched_switch(void *ignore, bool preempt, unsigned long flags; long disabled; int cpu; - int pc; + unsigned int trace_ctx;
tracing_record_cmdline(prev);
@@ -455,8 +452,6 @@ probe_wakeup_sched_switch(void *ignore, bool preempt, if (next != wakeup_task) return;
- pc = preempt_count(); - /* disable local data, not wakeup_cpu data */ cpu = raw_smp_processor_id(); disabled = atomic_inc_return(&per_cpu_ptr(wakeup_trace->array_buffer.data, cpu)->disabled); @@ -464,6 +459,8 @@ probe_wakeup_sched_switch(void *ignore, bool preempt, goto out;
local_irq_save(flags); + trace_ctx = tracing_gen_ctx_flags(flags); + arch_spin_lock(&wakeup_lock);
/* We could race with grabbing wakeup_lock */ @@ -473,9 +470,9 @@ probe_wakeup_sched_switch(void *ignore, bool preempt, /* The task we are waiting for is waking up */ data = per_cpu_ptr(wakeup_trace->array_buffer.data, wakeup_cpu);
- __trace_function(wakeup_trace, CALLER_ADDR0, CALLER_ADDR1, flags, pc); - tracing_sched_switch_trace(wakeup_trace, prev, next, flags, pc); - __trace_stack(wakeup_trace, flags, 0, pc); + __trace_function(wakeup_trace, CALLER_ADDR0, CALLER_ADDR1, trace_ctx); + tracing_sched_switch_trace(wakeup_trace, prev, next, trace_ctx); + __trace_stack(wakeup_trace, trace_ctx, 0);
T0 = data->preempt_timestamp; T1 = ftrace_now(cpu); @@ -527,9 +524,8 @@ probe_wakeup(void *ignore, struct task_struct *p) { struct trace_array_cpu *data; int cpu = smp_processor_id(); - unsigned long flags; long disabled; - int pc; + unsigned int trace_ctx;
if (likely(!tracer_enabled)) return; @@ -550,11 +546,12 @@ probe_wakeup(void *ignore, struct task_struct *p) (!dl_task(p) && (p->prio >= wakeup_prio || p->prio >= current->prio))) return;
- pc = preempt_count(); disabled = atomic_inc_return(&per_cpu_ptr(wakeup_trace->array_buffer.data, cpu)->disabled); if (unlikely(disabled != 1)) goto out;
+ trace_ctx = tracing_gen_ctx(); + /* interrupts should be off from try_to_wake_up */ arch_spin_lock(&wakeup_lock);
@@ -581,19 +578,17 @@ probe_wakeup(void *ignore, struct task_struct *p)
wakeup_task = get_task_struct(p);
- local_save_flags(flags); - data = per_cpu_ptr(wakeup_trace->array_buffer.data, wakeup_cpu); data->preempt_timestamp = ftrace_now(cpu); - tracing_sched_wakeup_trace(wakeup_trace, p, current, flags, pc); - __trace_stack(wakeup_trace, flags, 0, pc); + tracing_sched_wakeup_trace(wakeup_trace, p, current, trace_ctx); + __trace_stack(wakeup_trace, trace_ctx, 0);
 	/*
 	 * We must be careful in using CALLER_ADDR2. But since wake_up
 	 * is not called by an assembly function (whereas schedule is)
 	 * it should be safe to use it here.
 	 */
-	__trace_function(wakeup_trace, CALLER_ADDR1, CALLER_ADDR2, flags, pc);
+	__trace_function(wakeup_trace, CALLER_ADDR1, CALLER_ADDR2, trace_ctx);
out_locked: arch_spin_unlock(&wakeup_lock); diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index d85a2f0f316b..8bfcd3b09422 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -298,9 +298,8 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id) struct syscall_metadata *sys_data; struct ring_buffer_event *event; struct trace_buffer *buffer; - unsigned long irq_flags; + unsigned int trace_ctx; unsigned long args[6]; - int pc; int syscall_nr; int size;
@@ -322,12 +321,11 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id)
size = sizeof(*entry) + sizeof(unsigned long) * sys_data->nb_args;
- local_save_flags(irq_flags); - pc = preempt_count(); + trace_ctx = tracing_gen_ctx();
buffer = tr->array_buffer.buffer; event = trace_buffer_lock_reserve(buffer, - sys_data->enter_event->event.type, size, irq_flags, pc); + sys_data->enter_event->event.type, size, trace_ctx); if (!event) return;
@@ -337,7 +335,7 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id) memcpy(entry->args, args, sizeof(unsigned long) * sys_data->nb_args);
event_trigger_unlock_commit(trace_file, buffer, event, entry, - irq_flags, pc); + trace_ctx); }
static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret) @@ -348,8 +346,7 @@ static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret) struct syscall_metadata *sys_data; struct ring_buffer_event *event; struct trace_buffer *buffer; - unsigned long irq_flags; - int pc; + unsigned int trace_ctx; int syscall_nr;
syscall_nr = trace_get_syscall_nr(current, regs); @@ -368,13 +365,12 @@ static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret) if (!sys_data) return;
- local_save_flags(irq_flags); - pc = preempt_count(); + trace_ctx = tracing_gen_ctx();
buffer = tr->array_buffer.buffer; event = trace_buffer_lock_reserve(buffer, sys_data->exit_event->event.type, sizeof(*entry), - irq_flags, pc); + trace_ctx); if (!event) return;
@@ -383,7 +379,7 @@ static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret) entry->ret = syscall_get_return_value(current, regs);
event_trigger_unlock_commit(trace_file, buffer, event, entry, - irq_flags, pc); + trace_ctx); }
static int reg_event_syscall_enter(struct trace_event_file *file, diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 0dd6e286e519..a1bc68de1b29 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -965,7 +965,7 @@ static void __uprobe_trace_func(struct trace_uprobe *tu, esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu)); size = esize + tu->tp.size + dsize; event = trace_event_buffer_lock_reserve(&buffer, trace_file, - call->event.type, size, 0, 0); + call->event.type, size, 0); if (!event) return;
@@ -981,7 +981,7 @@ static void __uprobe_trace_func(struct trace_uprobe *tu,
memcpy(data, ucb->buf, tu->tp.size + dsize);
- event_trigger_unlock_commit(trace_file, buffer, event, entry, 0, 0); + event_trigger_unlock_commit(trace_file, buffer, event, entry, 0); }
/* uprobe handler */
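Taken together, the hunks above implement one mechanical conversion: every (irqflags, preempt count) argument pair that the tracers used to thread through trace_buffer_lock_reserve() and friends is folded into a single trace_ctx word produced by tracing_gen_ctx() or tracing_gen_ctx_flags(). The standalone C sketch below models that packing; it is illustrative only, the helper names pack_trace_ctx(), ctx_flags() and ctx_preempt_depth() are hypothetical, and the bit layout (trace flags in the upper bits, the 8-bit preemption depth in the low byte) reflects a reading of the v5.12-era helpers rather than a stable kernel ABI:

  /* trace_ctx_model.c - userspace model of the flags+pc packing (hypothetical). */
  #include <stdio.h>

  /* Values mirroring enum trace_flag_type in include/linux/trace_events.h. */
  #define TRACE_FLAG_IRQS_OFF     0x01
  #define TRACE_FLAG_NEED_RESCHED 0x04
  #define TRACE_FLAG_HARDIRQ      0x08
  #define TRACE_FLAG_SOFTIRQ      0x10

  /* Fold the trace flags and the 8-bit preemption depth into one word. */
  static unsigned int pack_trace_ctx(unsigned int trace_flags, unsigned int pc)
  {
          return (trace_flags << 16) | (pc & 0xff);
  }

  static unsigned int ctx_flags(unsigned int trace_ctx)
  {
          return trace_ctx >> 16;
  }

  static unsigned int ctx_preempt_depth(unsigned int trace_ctx)
  {
          return trace_ctx & 0xff;
  }

  int main(void)
  {
          /* An event recorded with IRQs off at preemption depth 2. */
          unsigned int ctx = pack_trace_ctx(TRACE_FLAG_IRQS_OFF, 2);

          printf("flags=%#x depth=%u\n", ctx_flags(ctx), ctx_preempt_depth(ctx));
          return 0;
  }

One word per ring-buffer event thus carries everything the output code needs to render the irqs-off/need-resched/hardirq-softirq markers and the preempt-depth column, which is why each reserve/commit helper in the diff loses a parameter.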
From: Daniel Bristot de Oliveira bristot@redhat.com
mainline inclusion from mainline-v5.14-rc1 commit bce29ac9ce0bb0b0b146b687ab978378c21e9078 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4G64B CVE: NA
-------------------------------------------------
In the context of high-performance computing (HPC), the Operating System Noise (*osnoise*) refers to the interference experienced by an application due to activities inside the operating system. In the context of Linux, NMIs, IRQs, SoftIRQs, and any other system thread can cause noise to the system. Moreover, hardware-related jobs can also cause noise, for example, via SMIs.
The osnoise tracer leverages the hwlat_detector by running a similar loop with preemption, SoftIRQs and IRQs enabled, thus allowing all the sources of *osnoise* during its execution. Using the same approach of hwlat, osnoise takes note of the entry and exit point of any source of interferences, increasing a per-cpu interference counter. The osnoise tracer also saves an interference counter for each source of interference. The interference counter for NMI, IRQs, SoftIRQs, and threads is increased anytime the tool observes these interferences' entry events. When a noise happens without any interference from the operating system level, the hardware noise counter increases, pointing to a hardware-related noise. In this way, osnoise can account for any source of interference. At the end of the period, the osnoise tracer prints the sum of all noise, the max single noise, the percentage of CPU available for the thread, and the counters for the noise sources.
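To make the sampling mechanism concrete, here is a heavily simplified userspace sketch of the measurement loop. It is not the kernel implementation: the real run_osnoise() in kernel/trace/trace_osnoise.c uses trace_clock_local(), consults the per-cpu interference counters to classify each gap, and emits the tracepoints described below. Only the 5 us default threshold is taken from the tracer; everything else is illustrative.

  #include <stdio.h>
  #include <stdint.h>
  #include <time.h>

  static uint64_t time_get_ns(void)
  {
          struct timespec ts;

          clock_gettime(CLOCK_MONOTONIC, &ts);
          return (uint64_t)ts.tv_sec * 1000000000ULL + ts.tv_nsec;
  }

  int main(void)
  {
          const uint64_t threshold = 5000;        /* 5 us, the tracer's default */
          const uint64_t runtime = 1000000000ULL; /* 1 s sample window */
          uint64_t start = time_get_ns();
          uint64_t last = start, sum_noise = 0, max_noise = 0;

          while (last - start < runtime) {
                  uint64_t now = time_get_ns();
                  uint64_t gap = now - last;

                  /* A gap above the threshold means something stole the CPU. */
                  if (gap >= threshold) {
                          sum_noise += gap;
                          if (gap > max_noise)
                                  max_noise = gap;
                  }
                  last = now;
          }
          printf("noise %llu ns, max single %llu ns\n",
                 (unsigned long long)sum_noise,
                 (unsigned long long)max_noise);
          return 0;
  }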
Usage
Write the ASCII text "osnoise" into the current_tracer file of the tracing system (generally mounted at /sys/kernel/tracing).
For example::
  [root@f32 ~]# cd /sys/kernel/tracing/
  [root@f32 tracing]# echo osnoise > current_tracer
It is possible to follow the trace by reading the trace file::
  [root@f32 tracing]# cat trace
  # tracer: osnoise
  #
  #                                _-----=> irqs-off
  #                               / _----=> need-resched
  #                              | / _---=> hardirq/softirq
  #                              || / _--=> preempt-depth                            MAX
  #                              || /                                             SINGLE      Interference counters:
  #                              ||||               RUNTIME      NOISE  % OF CPU  NOISE    +-----------------------------+
  #           TASK-PID      CPU# ||||   TIMESTAMP    IN US       IN US  AVAILABLE  IN US     HW    NMI    IRQ   SIRQ THREAD
  #              | |         |   ||||      |           |             |    |            |      |      |      |      |      |
             <...>-859     [000] ....   81.637220: 1000000        190  99.98100       9     18      0   1007     18      1
             <...>-860     [001] ....   81.638154: 1000000        656  99.93440      74     23      0   1006     16      3
             <...>-861     [002] ....   81.638193: 1000000       5675  99.43250     202      6      0   1013     25     21
             <...>-862     [003] ....   81.638242: 1000000        125  99.98750      45      1      0   1011     23      0
             <...>-863     [004] ....   81.638260: 1000000       1721  99.82790     168      7      0   1002     49     41
             <...>-864     [005] ....   81.638286: 1000000        263  99.97370      57      6      0   1006     26      2
             <...>-865     [006] ....   81.638302: 1000000        109  99.98910      21      3      0   1006     18      1
             <...>-866     [007] ....   81.638326: 1000000       7816  99.21840     107      8      0   1016     39     19
In addition to the regular trace fields (from TASK-PID to TIMESTAMP), the tracer prints a message at the end of each period for each CPU that is running an osnoise/CPU thread. The osnoise-specific fields report:
 - The RUNTIME IN US reports the amount of time in microseconds that the osnoise thread kept looping reading the time.
 - The NOISE IN US reports the sum of noise in microseconds observed by the osnoise tracer during the associated runtime.
 - The % OF CPU AVAILABLE reports the percentage of CPU available for the osnoise thread during the runtime window.
 - The MAX SINGLE NOISE IN US reports the maximum single noise observed during the runtime window.
 - The Interference counters display how many times each of the respective interferences happened during the runtime window.
Note that the example above shows a high number of HW noise samples. The reason is that this sample was taken on a virtual machine, where host interference is detected as hardware interference.
Tracer options
The tracer has a set of options inside the osnoise directory; they are listed below, with a configuration sketch after the list:
 - osnoise/cpus: CPUs at which an osnoise thread will execute.
 - osnoise/period_us: the period of the osnoise thread.
 - osnoise/runtime_us: how long an osnoise thread will look for noise.
 - osnoise/stop_tracing_us: stop the system tracing if a single noise higher than the configured value happens. Writing 0 disables this option.
 - osnoise/stop_tracing_total_us: stop the system tracing if total noise higher than the configured value happens. Writing 0 disables this option.
 - tracing_threshold: the minimum delta between two time() reads to be considered as noise, in us. When set to 0, the default value will be used, which is currently 5 us.
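For instance, a run confined to CPUs 0-3, sampling for 200 ms out of every second and stopping on any single noise above 100 us, could be set up as follows (the values are arbitrary; only the file names listed above are assumed)::

  [root@f32 tracing]# echo 0-3 > osnoise/cpus
  [root@f32 tracing]# echo 1000000 > osnoise/period_us
  [root@f32 tracing]# echo 200000 > osnoise/runtime_us
  [root@f32 tracing]# echo 100 > osnoise/stop_tracing_us
  [root@f32 tracing]# echo osnoise > current_tracer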
Additional Tracing
In addition to the tracer, a set of tracepoints was added to facilitate the identification of the osnoise source; an example of enabling them follows the list.
 - osnoise:sample_threshold: printed anytime a noise is higher than the configurable tolerance_ns.
 - osnoise:nmi_noise: noise from NMI, including the duration.
 - osnoise:irq_noise: noise from an IRQ, including the duration.
 - osnoise:softirq_noise: noise from a SoftIRQ, including the duration.
 - osnoise:thread_noise: noise from a thread, including the duration.
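Since these are regular tracepoints, the whole group can be enabled through the usual tracefs event interface, e.g.::

  [root@f32 tracing]# echo 1 > events/osnoise/enable
  [root@f32 tracing]# cat trace_pipe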
Note that all the values are *net values*. For example, if, while osnoise is running, another thread preempts the osnoise thread, it will start a thread_noise duration at that point. Then, an IRQ takes place, preempting the thread_noise, starting an irq_noise. When the IRQ ends its execution, it will compute its duration, and this duration will be subtracted from the thread_noise, in such a way as to avoid the double accounting of the IRQ execution. This logic is valid for all sources of noise.
Here is one example of the usage of these tracepoints::
  osnoise/8-961     [008] d.h.  5789.857532: irq_noise: local_timer:236 start 5789.857529929 duration 1845 ns
  osnoise/8-961     [008] dNh.  5789.858408: irq_noise: local_timer:236 start 5789.858404871 duration 2848 ns
  migration/8-54    [008] d...  5789.858413: thread_noise: migration/8:54 start 5789.858409300 duration 3068 ns
  osnoise/8-961     [008] ....  5789.858413: sample_threshold: start 5789.858404555 duration 8723 ns interferences 2
In this example, a noise sample of 8 microseconds was reported in the last line, pointing to two interferences. Looking backward in the trace, the two previous entries were about the migration thread running after a timer IRQ execution. The first event is not part of the noise because it took place one millisecond before.
It is worth noticing that the sum of the durations reported by the tracepoints is smaller than the eight us reported by sample_threshold. The reason lies in the overhead of the entry and exit code that runs before and after any interference execution. This justifies the dual approach: the measuring thread and the tracing.
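As a rough check on the numbers: the two interferences inside the sampled window account for 2848 + 3068 = 5916 ns of the 8723 ns that sample_threshold reports, leaving roughly 2.8 us attributable to that entry/exit overhead, which no individual tracepoint can claim.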
Link: https://lkml.kernel.org/r/e649467042d60e7b62714c9c6751a56299d15119.162437231...
Cc: Phil Auld pauld@redhat.com Cc: Sebastian Andrzej Siewior bigeasy@linutronix.de Cc: Kate Carcia kcarcia@redhat.com Cc: Jonathan Corbet corbet@lwn.net Cc: Ingo Molnar mingo@redhat.com Cc: Peter Zijlstra peterz@infradead.org Cc: Thomas Gleixner tglx@linutronix.de Cc: Alexandre Chartre alexandre.chartre@oracle.com Cc: Clark Willaims williams@redhat.com Cc: John Kacur jkacur@redhat.com Cc: Juri Lelli juri.lelli@redhat.com Cc: Borislav Petkov bp@alien8.de Cc: "H. Peter Anvin" hpa@zytor.com Cc: x86@kernel.org Cc: linux-doc@vger.kernel.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Daniel Bristot de Oliveira bristot@redhat.com [ Made the following functions static: trace_irqentry_callback() trace_irqexit_callback() trace_intel_irqentry_callback() trace_intel_irqexit_callback()
Added to include/trace.h: osnoise_arch_register() osnoise_arch_unregister()
Fixed define logic for LATENCY_FS_NOTIFY
Reported-by: kernel test robot lkp@intel.com ] Signed-off-by: Steven Rostedt (VMware) rostedt@goodmis.org Signed-off-by: Wang ShaoBo bobo.shaobowang@huawei.com Acked-by: Xie XiuQi xiexiuqi@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- Documentation/trace/index.rst | 1 + Documentation/trace/osnoise-tracer.rst | 152 +++ arch/x86/kernel/Makefile | 1 + arch/x86/kernel/trace.c | 237 ++++ include/linux/ftrace_irq.h | 13 + include/linux/trace.h | 5 + include/trace/events/osnoise.h | 142 +++ kernel/trace/Kconfig | 34 + kernel/trace/Makefile | 1 + kernel/trace/trace.h | 9 +- kernel/trace/trace_entries.h | 25 + kernel/trace/trace_osnoise.c | 1384 ++++++++++++++++++++++++ kernel/trace/trace_output.c | 72 +- 13 files changed, 2072 insertions(+), 4 deletions(-) create mode 100644 Documentation/trace/osnoise-tracer.rst create mode 100644 arch/x86/kernel/trace.c create mode 100644 include/trace/events/osnoise.h create mode 100644 kernel/trace/trace_osnoise.c
diff --git a/Documentation/trace/index.rst b/Documentation/trace/index.rst
index f634b36fd3aa..608107b27cc0 100644
--- a/Documentation/trace/index.rst
+++ b/Documentation/trace/index.rst
@@ -23,6 +23,7 @@ Linux Tracing Technologies
    histogram-design
    boottime-trace
    hwlat_detector
+   osnoise-tracer
    intel_th
    ring-buffer-design
    stm
diff --git a/Documentation/trace/osnoise-tracer.rst b/Documentation/trace/osnoise-tracer.rst
new file mode 100644
index 000000000000..37a3c10fb216
--- /dev/null
+++ b/Documentation/trace/osnoise-tracer.rst
@@ -0,0 +1,152 @@
+==============
+OSNOISE Tracer
+==============
+
+In the context of high-performance computing (HPC), the Operating System
+Noise (*osnoise*) refers to the interference experienced by an application
+due to activities inside the operating system. In the context of Linux,
+NMIs, IRQs, SoftIRQs, and any other system thread can cause noise to the
+system. Moreover, hardware-related jobs can also cause noise, for example,
+via SMIs.
+
+hwlat_detector is one of the tools used to identify the most complex
+source of noise: *hardware noise*.
+
+In a nutshell, the hwlat_detector creates a thread that runs
+periodically for a given period. At the beginning of a period, the thread
+disables interrupts and starts sampling. While running, the hwlatd
+thread reads the time in a loop. As interrupts are disabled, threads,
+IRQs, and SoftIRQs cannot interfere with the hwlatd thread. Hence, the
+cause of any gap between two different reads of the time roots either on
+NMI or in the hardware itself. At the end of the period, hwlatd enables
+interrupts and reports the max observed gap between the reads. It also
+prints an NMI occurrence counter. If the output does not report NMI
+executions, the user can conclude that the hardware is the culprit for
+the latency. The hwlat detects the NMI execution by observing
+the entry and exit of an NMI.
+
+The osnoise tracer leverages the hwlat_detector by running a
+similar loop with preemption, SoftIRQs and IRQs enabled, thus allowing
+all the sources of *osnoise* during its execution. Using the same approach
+of hwlat, osnoise takes note of the entry and exit point of any
+source of interferences, increasing a per-cpu interference counter. The
+osnoise tracer also saves an interference counter for each source of
+interference. The interference counter for NMI, IRQs, SoftIRQs, and
+threads is increased anytime the tool observes these interferences' entry
+events. When a noise happens without any interference from the operating
+system level, the hardware noise counter increases, pointing to a
+hardware-related noise. In this way, osnoise can account for any
+source of interference. At the end of the period, the osnoise tracer
+prints the sum of all noise, the max single noise, the percentage of CPU
+available for the thread, and the counters for the noise sources.
+
+Usage
+-----
+
+Write the ASCII text "osnoise" into the current_tracer file of the
+tracing system (generally mounted at /sys/kernel/tracing).
+
+For example::
+
+  [root@f32 ~]# cd /sys/kernel/tracing/
+  [root@f32 tracing]# echo osnoise > current_tracer
+
+It is possible to follow the trace by reading the trace file::
+
+  [root@f32 tracing]# cat trace
+  # tracer: osnoise
+  #
+  #                                _-----=> irqs-off
+  #                               / _----=> need-resched
+  #                              | / _---=> hardirq/softirq
+  #                              || / _--=> preempt-depth                            MAX
+  #                              || /                                             SINGLE      Interference counters:
+  #                              ||||               RUNTIME      NOISE  % OF CPU  NOISE    +-----------------------------+
+  #           TASK-PID      CPU# ||||   TIMESTAMP    IN US       IN US  AVAILABLE  IN US     HW    NMI    IRQ   SIRQ THREAD
+  #              | |         |   ||||      |           |             |    |            |      |      |      |      |      |
+             <...>-859     [000] ....   81.637220: 1000000        190  99.98100       9     18      0   1007     18      1
+             <...>-860     [001] ....   81.638154: 1000000        656  99.93440      74     23      0   1006     16      3
+             <...>-861     [002] ....   81.638193: 1000000       5675  99.43250     202      6      0   1013     25     21
+             <...>-862     [003] ....   81.638242: 1000000        125  99.98750      45      1      0   1011     23      0
+             <...>-863     [004] ....   81.638260: 1000000       1721  99.82790     168      7      0   1002     49     41
+             <...>-864     [005] ....   81.638286: 1000000        263  99.97370      57      6      0   1006     26      2
+             <...>-865     [006] ....   81.638302: 1000000        109  99.98910      21      3      0   1006     18      1
+             <...>-866     [007] ....   81.638326: 1000000       7816  99.21840     107      8      0   1016     39     19
+
+In addition to the regular trace fields (from TASK-PID to TIMESTAMP), the
+tracer prints a message at the end of each period for each CPU that is
+running an osnoise/ thread. The osnoise-specific fields report:
+
+ - The RUNTIME IN US reports the amount of time in microseconds that
+   the osnoise thread kept looping reading the time.
+ - The NOISE IN US reports the sum of noise in microseconds observed
+   by the osnoise tracer during the associated runtime.
+ - The % OF CPU AVAILABLE reports the percentage of CPU available for
+   the osnoise thread during the runtime window.
+ - The MAX SINGLE NOISE IN US reports the maximum single noise observed
+   during the runtime window.
+ - The Interference counters display how many times each of the respective
+   interferences happened during the runtime window.
+
+Note that the example above shows a high number of HW noise samples.
+The reason is that this sample was taken on a virtual machine,
+and the host interference is detected as a hardware interference.
+
+Tracer options
+---------------------
+
+The tracer has a set of options inside the osnoise directory; they are:
+
+ - osnoise/cpus: CPUs at which an osnoise thread will execute.
+ - osnoise/period_us: the period of the osnoise thread.
+ - osnoise/runtime_us: how long an osnoise thread will look for noise.
+ - osnoise/stop_tracing_us: stop the system tracing if a single noise
+   higher than the configured value happens. Writing 0 disables this
+   option.
+ - osnoise/stop_tracing_total_us: stop the system tracing if total noise
+   higher than the configured value happens. Writing 0 disables this
+   option.
+ - tracing_threshold: the minimum delta between two time() reads to be
+   considered as noise, in us. When set to 0, the default value will
+   be used, which is currently 5 us.
+
+Additional Tracing
+------------------
+
+In addition to the tracer, a set of tracepoints was added to
+facilitate the identification of the osnoise source.
+
+ - osnoise:sample_threshold: printed anytime a noise is higher than
+   the configurable tolerance_ns.
+ - osnoise:nmi_noise: noise from NMI, including the duration.
+ - osnoise:irq_noise: noise from an IRQ, including the duration.
+ - osnoise:softirq_noise: noise from a SoftIRQ, including the
+   duration.
+ - osnoise:thread_noise: noise from a thread, including the duration.
+
+Note that all the values are *net values*. For example, if, while osnoise
+is running, another thread preempts the osnoise thread, it will start a
+thread_noise duration at that point. Then, an IRQ takes place, preempting
+the thread_noise, starting an irq_noise. When the IRQ ends its execution,
+it will compute its duration, and this duration will be subtracted from
+the thread_noise, in such a way as to avoid the double accounting of the
+IRQ execution. This logic is valid for all sources of noise.
+
+Here is one example of the usage of these tracepoints::
+
+  osnoise/8-961     [008] d.h.  5789.857532: irq_noise: local_timer:236 start 5789.857529929 duration 1845 ns
+  osnoise/8-961     [008] dNh.  5789.858408: irq_noise: local_timer:236 start 5789.858404871 duration 2848 ns
+  migration/8-54    [008] d...  5789.858413: thread_noise: migration/8:54 start 5789.858409300 duration 3068 ns
+  osnoise/8-961     [008] ....  5789.858413: sample_threshold: start 5789.858404555 duration 8723 ns interferences 2
+
+In this example, a noise sample of 8 microseconds was reported in the last
+line, pointing to two interferences. Looking backward in the trace, the
+two previous entries were about the migration thread running after a
+timer IRQ execution. The first event is not part of the noise because
+it took place one millisecond before.
+
+It is worth noticing that the sum of the durations reported by the
+tracepoints is smaller than the eight us reported by sample_threshold.
+The reason lies in the overhead of the entry and exit code that runs
+before and after any interference execution. This justifies the dual
+approach: the measuring thread and the tracing.
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index cbed6c1f3fa5..f0606f816aa8 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -107,6 +107,7 @@ obj-$(CONFIG_FUNCTION_TRACER) += ftrace_$(BITS).o
 obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += ftrace.o
 obj-$(CONFIG_FTRACE_SYSCALLS) += ftrace.o
 obj-$(CONFIG_X86_TSC) += trace_clock.o
+obj-$(CONFIG_TRACING) += trace.o
 obj-$(CONFIG_CRASH_CORE) += crash_core_$(BITS).o
 obj-$(CONFIG_KEXEC_CORE) += machine_kexec_$(BITS).o
 obj-$(CONFIG_KEXEC_CORE) += relocate_kernel_$(BITS).o crash.o
diff --git a/arch/x86/kernel/trace.c b/arch/x86/kernel/trace.c
new file mode 100644
index 000000000000..6912672c33a7
--- /dev/null
+++ b/arch/x86/kernel/trace.c
@@ -0,0 +1,237 @@
+#include <asm/trace/irq_vectors.h>
+#include <linux/trace.h>
+
+#if defined(CONFIG_OSNOISE_TRACER) && defined(CONFIG_X86_LOCAL_APIC)
+extern void osnoise_trace_irq_entry(int id);
+extern void osnoise_trace_irq_exit(int id, const char *desc);
+
+/*
+ * trace_intel_irq_entry - record intel specific IRQ entry
+ */
+static void trace_intel_irq_entry(void *data, int vector)
+{
+	osnoise_trace_irq_entry(vector);
+}
+
+/*
+ * trace_intel_irq_exit - record intel specific IRQ exit
+ */
+static void trace_intel_irq_exit(void *data, int vector)
+{
+	char *vector_desc = (char *) data;
+
+	osnoise_trace_irq_exit(vector, vector_desc);
+}
+
+/*
+ * register_intel_irq_tp - Register intel specific IRQ entry tracepoints
+ */
+int osnoise_arch_register(void)
+{
+	int ret;
+
+	ret = register_trace_local_timer_entry(trace_intel_irq_entry, NULL);
+	if (ret)
+		goto out_err;
+
+	ret = register_trace_local_timer_exit(trace_intel_irq_exit, "local_timer");
+	if (ret)
+		goto out_timer_entry;
+
+#ifdef CONFIG_X86_THERMAL_VECTOR
+	ret = register_trace_thermal_apic_entry(trace_intel_irq_entry, NULL);
+	if (ret)
+		goto out_timer_exit;
+
+	ret =
register_trace_thermal_apic_exit(trace_intel_irq_exit, "thermal_apic"); + if (ret) + goto out_thermal_entry; +#endif /* CONFIG_X86_THERMAL_VECTOR */ + +#ifdef CONFIG_X86_MCE_AMD + ret = register_trace_deferred_error_apic_entry(trace_intel_irq_entry, NULL); + if (ret) + goto out_thermal_exit; + + ret = register_trace_deferred_error_apic_exit(trace_intel_irq_exit, "deferred_error"); + if (ret) + goto out_deferred_entry; +#endif + +#ifdef CONFIG_X86_MCE_THRESHOLD + ret = register_trace_threshold_apic_entry(trace_intel_irq_entry, NULL); + if (ret) + goto out_deferred_exit; + + ret = register_trace_threshold_apic_exit(trace_intel_irq_exit, "threshold_apic"); + if (ret) + goto out_threshold_entry; +#endif /* CONFIG_X86_MCE_THRESHOLD */ + +#ifdef CONFIG_SMP + ret = register_trace_call_function_single_entry(trace_intel_irq_entry, NULL); + if (ret) + goto out_threshold_exit; + + ret = register_trace_call_function_single_exit(trace_intel_irq_exit, + "call_function_single"); + if (ret) + goto out_call_function_single_entry; + + ret = register_trace_call_function_entry(trace_intel_irq_entry, NULL); + if (ret) + goto out_call_function_single_exit; + + ret = register_trace_call_function_exit(trace_intel_irq_exit, "call_function"); + if (ret) + goto out_call_function_entry; + + ret = register_trace_reschedule_entry(trace_intel_irq_entry, NULL); + if (ret) + goto out_call_function_exit; + + ret = register_trace_reschedule_exit(trace_intel_irq_exit, "reschedule"); + if (ret) + goto out_reschedule_entry; +#endif /* CONFIG_SMP */ + +#ifdef CONFIG_IRQ_WORK + ret = register_trace_irq_work_entry(trace_intel_irq_entry, NULL); + if (ret) + goto out_reschedule_exit; + + ret = register_trace_irq_work_exit(trace_intel_irq_exit, "irq_work"); + if (ret) + goto out_irq_work_entry; +#endif + + ret = register_trace_x86_platform_ipi_entry(trace_intel_irq_entry, NULL); + if (ret) + goto out_irq_work_exit; + + ret = register_trace_x86_platform_ipi_exit(trace_intel_irq_exit, "x86_platform_ipi"); + if (ret) + goto out_x86_ipi_entry; + + ret = register_trace_error_apic_entry(trace_intel_irq_entry, NULL); + if (ret) + goto out_x86_ipi_exit; + + ret = register_trace_error_apic_exit(trace_intel_irq_exit, "error_apic"); + if (ret) + goto out_error_apic_entry; + + ret = register_trace_spurious_apic_entry(trace_intel_irq_entry, NULL); + if (ret) + goto out_error_apic_exit; + + ret = register_trace_spurious_apic_exit(trace_intel_irq_exit, "spurious_apic"); + if (ret) + goto out_spurious_apic_entry; + + return 0; + +out_spurious_apic_entry: + unregister_trace_spurious_apic_entry(trace_intel_irq_entry, NULL); +out_error_apic_exit: + unregister_trace_error_apic_exit(trace_intel_irq_exit, "error_apic"); +out_error_apic_entry: + unregister_trace_error_apic_entry(trace_intel_irq_entry, NULL); +out_x86_ipi_exit: + unregister_trace_x86_platform_ipi_exit(trace_intel_irq_exit, "x86_platform_ipi"); +out_x86_ipi_entry: + unregister_trace_x86_platform_ipi_entry(trace_intel_irq_entry, NULL); +out_irq_work_exit: + +#ifdef CONFIG_IRQ_WORK + unregister_trace_irq_work_exit(trace_intel_irq_exit, "irq_work"); +out_irq_work_entry: + unregister_trace_irq_work_entry(trace_intel_irq_entry, NULL); +out_reschedule_exit: +#endif + +#ifdef CONFIG_SMP + unregister_trace_reschedule_exit(trace_intel_irq_exit, "reschedule"); +out_reschedule_entry: + unregister_trace_reschedule_entry(trace_intel_irq_entry, NULL); +out_call_function_exit: + unregister_trace_call_function_exit(trace_intel_irq_exit, "call_function"); +out_call_function_entry: + 
	unregister_trace_call_function_entry(trace_intel_irq_entry, NULL);
+out_call_function_single_exit:
+	unregister_trace_call_function_single_exit(trace_intel_irq_exit, "call_function_single");
+out_call_function_single_entry:
+	unregister_trace_call_function_single_entry(trace_intel_irq_entry, NULL);
+out_threshold_exit:
+#endif
+
+#ifdef CONFIG_X86_MCE_THRESHOLD
+	unregister_trace_threshold_apic_exit(trace_intel_irq_exit, "threshold_apic");
+out_threshold_entry:
+	unregister_trace_threshold_apic_entry(trace_intel_irq_entry, NULL);
+out_deferred_exit:
+#endif
+
+#ifdef CONFIG_X86_MCE_AMD
+	unregister_trace_deferred_error_apic_exit(trace_intel_irq_exit, "deferred_error");
+out_deferred_entry:
+	unregister_trace_deferred_error_apic_entry(trace_intel_irq_entry, NULL);
+out_thermal_exit:
+#endif /* CONFIG_X86_MCE_AMD */
+
+#ifdef CONFIG_X86_THERMAL_VECTOR
+	unregister_trace_thermal_apic_exit(trace_intel_irq_exit, "thermal_apic");
+out_thermal_entry:
+	unregister_trace_thermal_apic_entry(trace_intel_irq_entry, NULL);
+out_timer_exit:
+#endif /* CONFIG_X86_THERMAL_VECTOR */
+
+	unregister_trace_local_timer_exit(trace_intel_irq_exit, "local_timer");
+out_timer_entry:
+	unregister_trace_local_timer_entry(trace_intel_irq_entry, NULL);
+out_err:
+	return -EINVAL;
+}
+
+void osnoise_arch_unregister(void)
+{
+	unregister_trace_spurious_apic_exit(trace_intel_irq_exit, "spurious_apic");
+	unregister_trace_spurious_apic_entry(trace_intel_irq_entry, NULL);
+	unregister_trace_error_apic_exit(trace_intel_irq_exit, "error_apic");
+	unregister_trace_error_apic_entry(trace_intel_irq_entry, NULL);
+	unregister_trace_x86_platform_ipi_exit(trace_intel_irq_exit, "x86_platform_ipi");
+	unregister_trace_x86_platform_ipi_entry(trace_intel_irq_entry, NULL);
+
+#ifdef CONFIG_IRQ_WORK
+	unregister_trace_irq_work_exit(trace_intel_irq_exit, "irq_work");
+	unregister_trace_irq_work_entry(trace_intel_irq_entry, NULL);
+#endif
+
+#ifdef CONFIG_SMP
+	unregister_trace_reschedule_exit(trace_intel_irq_exit, "reschedule");
+	unregister_trace_reschedule_entry(trace_intel_irq_entry, NULL);
+	unregister_trace_call_function_exit(trace_intel_irq_exit, "call_function");
+	unregister_trace_call_function_entry(trace_intel_irq_entry, NULL);
+	unregister_trace_call_function_single_exit(trace_intel_irq_exit, "call_function_single");
+	unregister_trace_call_function_single_entry(trace_intel_irq_entry, NULL);
+#endif
+
+#ifdef CONFIG_X86_MCE_THRESHOLD
+	unregister_trace_threshold_apic_exit(trace_intel_irq_exit, "threshold_apic");
+	unregister_trace_threshold_apic_entry(trace_intel_irq_entry, NULL);
+#endif
+
+#ifdef CONFIG_X86_MCE_AMD
+	unregister_trace_deferred_error_apic_exit(trace_intel_irq_exit, "deferred_error");
+	unregister_trace_deferred_error_apic_entry(trace_intel_irq_entry, NULL);
+#endif
+
+#ifdef CONFIG_X86_THERMAL_VECTOR
+	unregister_trace_thermal_apic_exit(trace_intel_irq_exit, "thermal_apic");
+	unregister_trace_thermal_apic_entry(trace_intel_irq_entry, NULL);
+#endif /* CONFIG_X86_THERMAL_VECTOR */
+
+	unregister_trace_local_timer_exit(trace_intel_irq_exit, "local_timer");
+	unregister_trace_local_timer_entry(trace_intel_irq_entry, NULL);
+}
+#endif /* CONFIG_OSNOISE_TRACER && CONFIG_X86_LOCAL_APIC */
diff --git a/include/linux/ftrace_irq.h b/include/linux/ftrace_irq.h
index 0abd9a1d2852..f6faa31289ba 100644
--- a/include/linux/ftrace_irq.h
+++ b/include/linux/ftrace_irq.h
@@ -7,12 +7,21 @@ extern bool trace_hwlat_callback_enabled;
 extern void trace_hwlat_callback(bool enter);
 #endif
+#ifdef CONFIG_OSNOISE_TRACER +extern bool trace_osnoise_callback_enabled; +extern void trace_osnoise_callback(bool enter); +#endif + static inline void ftrace_nmi_enter(void) { #ifdef CONFIG_HWLAT_TRACER if (trace_hwlat_callback_enabled) trace_hwlat_callback(true); #endif +#ifdef CONFIG_OSNOISE_TRACER + if (trace_osnoise_callback_enabled) + trace_osnoise_callback(true); +#endif }
static inline void ftrace_nmi_exit(void) @@ -21,6 +30,10 @@ static inline void ftrace_nmi_exit(void) if (trace_hwlat_callback_enabled) trace_hwlat_callback(false); #endif +#ifdef CONFIG_OSNOISE_TRACER + if (trace_osnoise_callback_enabled) + trace_osnoise_callback(false); +#endif }
#endif /* _LINUX_FTRACE_IRQ_H */ diff --git a/include/linux/trace.h b/include/linux/trace.h index 886a4ffd9d45..148c6b7e0ce6 100644 --- a/include/linux/trace.h +++ b/include/linux/trace.h @@ -40,6 +40,11 @@ int trace_array_init_printk(struct trace_array *tr); void trace_array_put(struct trace_array *tr); struct trace_array *trace_array_get_by_name(const char *name); int trace_array_destroy(struct trace_array *tr); + +/* For osnoise tracer */ +int osnoise_arch_register(void); +void osnoise_arch_unregister(void); + #endif /* CONFIG_TRACING */
#endif /* _LINUX_TRACE_H */ diff --git a/include/trace/events/osnoise.h b/include/trace/events/osnoise.h new file mode 100644 index 000000000000..28762c69f6c9 --- /dev/null +++ b/include/trace/events/osnoise.h @@ -0,0 +1,142 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM osnoise + +#if !defined(_OSNOISE_TRACE_H) || defined(TRACE_HEADER_MULTI_READ) +#define _OSNOISE_TRACE_H + +#include <linux/tracepoint.h> +TRACE_EVENT(thread_noise, + + TP_PROTO(struct task_struct *t, u64 start, u64 duration), + + TP_ARGS(t, start, duration), + + TP_STRUCT__entry( + __array( char, comm, TASK_COMM_LEN) + __field( u64, start ) + __field( u64, duration) + __field( pid_t, pid ) + ), + + TP_fast_assign( + memcpy(__entry->comm, t->comm, TASK_COMM_LEN); + __entry->pid = t->pid; + __entry->start = start; + __entry->duration = duration; + ), + + TP_printk("%8s:%d start %llu.%09u duration %llu ns", + __entry->comm, + __entry->pid, + __print_ns_to_secs(__entry->start), + __print_ns_without_secs(__entry->start), + __entry->duration) +); + +TRACE_EVENT(softirq_noise, + + TP_PROTO(int vector, u64 start, u64 duration), + + TP_ARGS(vector, start, duration), + + TP_STRUCT__entry( + __field( u64, start ) + __field( u64, duration) + __field( int, vector ) + ), + + TP_fast_assign( + __entry->vector = vector; + __entry->start = start; + __entry->duration = duration; + ), + + TP_printk("%8s:%d start %llu.%09u duration %llu ns", + show_softirq_name(__entry->vector), + __entry->vector, + __print_ns_to_secs(__entry->start), + __print_ns_without_secs(__entry->start), + __entry->duration) +); + +TRACE_EVENT(irq_noise, + + TP_PROTO(int vector, const char *desc, u64 start, u64 duration), + + TP_ARGS(vector, desc, start, duration), + + TP_STRUCT__entry( + __field( u64, start ) + __field( u64, duration) + __string( desc, desc ) + __field( int, vector ) + + ), + + TP_fast_assign( + __assign_str(desc, desc); + __entry->vector = vector; + __entry->start = start; + __entry->duration = duration; + ), + + TP_printk("%s:%d start %llu.%09u duration %llu ns", + __get_str(desc), + __entry->vector, + __print_ns_to_secs(__entry->start), + __print_ns_without_secs(__entry->start), + __entry->duration) +); + +TRACE_EVENT(nmi_noise, + + TP_PROTO(u64 start, u64 duration), + + TP_ARGS(start, duration), + + TP_STRUCT__entry( + __field( u64, start ) + __field( u64, duration) + ), + + TP_fast_assign( + __entry->start = start; + __entry->duration = duration; + ), + + TP_printk("start %llu.%09u duration %llu ns", + __print_ns_to_secs(__entry->start), + __print_ns_without_secs(__entry->start), + __entry->duration) +); + +TRACE_EVENT(sample_threshold, + + TP_PROTO(u64 start, u64 duration, u64 interference), + + TP_ARGS(start, duration, interference), + + TP_STRUCT__entry( + __field( u64, start ) + __field( u64, duration) + __field( u64, interference) + ), + + TP_fast_assign( + __entry->start = start; + __entry->duration = duration; + __entry->interference = interference; + ), + + TP_printk("start %llu.%09u duration %llu ns interferences %llu", + __print_ns_to_secs(__entry->start), + __print_ns_without_secs(__entry->start), + __entry->duration, + __entry->interference) +); + +#endif /* _TRACE_OSNOISE_H */ + +/* This part must be outside protection */ +#include <trace/define_trace.h> diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 29db703f6880..146ad9eec221 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -344,6 +344,40 @@ config HWLAT_TRACER file. 
Every time a latency is greater than tracing_thresh, it will be recorded into the ring buffer.
+config OSNOISE_TRACER
+	bool "OS Noise tracer"
+	select GENERIC_TRACER
+	help
+	  In the context of high-performance computing (HPC), the Operating
+	  System Noise (osnoise) refers to the interference experienced by an
+	  application due to activities inside the operating system. In the
+	  context of Linux, NMIs, IRQs, SoftIRQs, and any other system thread
+	  can cause noise to the system. Moreover, hardware-related jobs can
+	  also cause noise, for example, via SMIs.
+
+	  The osnoise tracer leverages the hwlat_detector by running a similar
+	  loop with preemption, SoftIRQs and IRQs enabled, thus allowing all
+	  the sources of osnoise during its execution. The osnoise tracer takes
+	  note of the entry and exit point of any source of interferences,
+	  increasing a per-cpu interference counter. It saves an interference
+	  counter for each source of interference. The interference counter for
+	  NMI, IRQs, SoftIRQs, and threads is increased anytime the tool
+	  observes these interferences' entry events. When a noise happens
+	  without any interference from the operating system level, the
+	  hardware noise counter increases, pointing to a hardware-related
+	  noise. In this way, osnoise can account for any source of
+	  interference. At the end of the period, the osnoise tracer prints
+	  the sum of all noise, the max single noise, the percentage of CPU
+	  available for the thread, and the counters for the noise sources.
+
+	  In addition to the tracer, a set of tracepoints was added to
+	  facilitate the identification of the osnoise source.
+
+	  The output will appear in the trace and trace_pipe files.
+
+	  To enable this tracer, echo "osnoise" into the current_tracer
+	  file.
+
 config MMIOTRACE
	bool "Memory mapped IO tracing"
	depends on HAVE_MMIOTRACE_SUPPORT && PCI
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index 20bf3ada8c94..8ab4d4290101 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -58,6 +58,7 @@ obj-$(CONFIG_IRQSOFF_TRACER) += trace_irqsoff.o
 obj-$(CONFIG_PREEMPT_TRACER) += trace_irqsoff.o
 obj-$(CONFIG_SCHED_TRACER) += trace_sched_wakeup.o
 obj-$(CONFIG_HWLAT_TRACER) += trace_hwlat.o
+obj-$(CONFIG_OSNOISE_TRACER) += trace_osnoise.o
 obj-$(CONFIG_NOP_TRACER) += trace_nop.o
 obj-$(CONFIG_STACK_TRACER) += trace_stack.o
 obj-$(CONFIG_MMIOTRACE) += trace_mmiotrace.o
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 3261691b5756..75fd887496a1 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -44,6 +44,7 @@ enum trace_type {
	TRACE_BLK,
	TRACE_BPUTS,
	TRACE_HWLAT,
+	TRACE_OSNOISE,
	TRACE_RAW_DATA,
__TRACE_LAST_TYPE, @@ -304,7 +305,8 @@ struct trace_array { struct array_buffer max_buffer; bool allocated_snapshot; #endif -#if defined(CONFIG_TRACER_MAX_TRACE) || defined(CONFIG_HWLAT_TRACER) +#if defined(CONFIG_TRACER_MAX_TRACE) || defined(CONFIG_HWLAT_TRACER) \ + || defined(CONFIG_OSNOISE_TRACER) unsigned long max_latency; #ifdef CONFIG_FSNOTIFY struct dentry *d_max_latency; @@ -450,6 +452,7 @@ extern void __ftrace_bad_type(void); IF_ASSIGN(var, ent, struct bprint_entry, TRACE_BPRINT); \ IF_ASSIGN(var, ent, struct bputs_entry, TRACE_BPUTS); \ IF_ASSIGN(var, ent, struct hwlat_entry, TRACE_HWLAT); \ + IF_ASSIGN(var, ent, struct osnoise_entry, TRACE_OSNOISE);\ IF_ASSIGN(var, ent, struct raw_data_entry, TRACE_RAW_DATA);\ IF_ASSIGN(var, ent, struct trace_mmiotrace_rw, \ TRACE_MMIO_RW); \ @@ -828,8 +831,8 @@ void update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu); #endif /* CONFIG_TRACER_MAX_TRACE */
-#if (defined(CONFIG_TRACER_MAX_TRACE) || defined(CONFIG_HWLAT_TRACER)) && \ - defined(CONFIG_FSNOTIFY) +#if (defined(CONFIG_TRACER_MAX_TRACE) || defined(CONFIG_HWLAT_TRACER) || \ + defined(CONFIG_OSNOISE_TRACER)) && defined(CONFIG_FSNOTIFY)
void latency_fsnotify(struct trace_array *tr);
diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h
index 18c4a58aff79..f689b7e5c29d 100644
--- a/kernel/trace/trace_entries.h
+++ b/kernel/trace/trace_entries.h
@@ -338,3 +338,28 @@ FTRACE_ENTRY(hwlat, hwlat_entry,
		 __entry->nmi_total_ts,
		 __entry->nmi_count)
 );
+
+FTRACE_ENTRY(osnoise, osnoise_entry,
+
+	TRACE_OSNOISE,
+
+	F_STRUCT(
+		__field( u64, noise )
+		__field( u64, runtime )
+		__field( u64, max_sample )
+		__field( unsigned int, hw_count )
+		__field( unsigned int, nmi_count )
+		__field( unsigned int, irq_count )
+		__field( unsigned int, softirq_count )
+		__field( unsigned int, thread_count )
+	),
+
+	F_printk("noise:%llu\tmax_sample:%llu\thw:%u\tnmi:%u\tirq:%u\tsoftirq:%u\tthread:%u\n",
+		 __entry->noise,
+		 __entry->max_sample,
+		 __entry->hw_count,
+		 __entry->nmi_count,
+		 __entry->irq_count,
+		 __entry->softirq_count,
+		 __entry->thread_count)
+);
diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c
new file mode 100644
index 000000000000..4e2c47dc4f19
--- /dev/null
+++ b/kernel/trace/trace_osnoise.c
@@ -0,0 +1,1384 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * OS Noise Tracer: computes the OS Noise suffered by a running thread.
+ *
+ * Based on "hwlat_detector" tracer by:
+ *   Copyright (C) 2008-2009 Jon Masters, Red Hat, Inc. jcm@redhat.com
+ *   Copyright (C) 2013-2016 Steven Rostedt, Red Hat, Inc. srostedt@redhat.com
+ *   With feedback from Clark Williams williams@redhat.com
+ *
+ * And also based on the rtsl tracer presented on:
+ *   DE OLIVEIRA, Daniel Bristot, et al. Demystifying the real-time linux
+ *   scheduling latency. In: 32nd Euromicro Conference on Real-Time Systems
+ *   (ECRTS 2020). Schloss Dagstuhl-Leibniz-Zentrum fur Informatik, 2020.
+ *
+ * Copyright (C) 2021 Daniel Bristot de Oliveira, Red Hat, Inc. bristot@redhat.com
+ */
+
+#include <linux/kthread.h>
+#include <linux/tracefs.h>
+#include <linux/uaccess.h>
+#include <linux/cpumask.h>
+#include <linux/delay.h>
+#include <linux/sched/clock.h>
+#include <linux/sched.h>
+#include "trace.h"
+
+#ifdef CONFIG_X86_LOCAL_APIC
+#include <asm/trace/irq_vectors.h>
+#undef TRACE_INCLUDE_PATH
+#undef TRACE_INCLUDE_FILE
+#endif /* CONFIG_X86_LOCAL_APIC */
+
+#include <trace/events/irq.h>
+#include <trace/events/sched.h>
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/osnoise.h>
+
+static struct trace_array *osnoise_trace;
+
+/*
+ * Default values.
+ */
+#define BANNER "osnoise: "
+#define DEFAULT_SAMPLE_PERIOD 1000000 /* 1s */
+#define DEFAULT_SAMPLE_RUNTIME 1000000 /* 1s */
+
+/*
+ * NMI runtime info.
+ */
+struct osn_nmi {
+	u64 count;
+	u64 delta_start;
+};
+
+/*
+ * IRQ runtime info.
+ */
+struct osn_irq {
+	u64 count;
+	u64 arrival_time;
+	u64 delta_start;
+};
+
+/*
+ * softirq runtime info.
+ */
+struct osn_softirq {
+	u64 count;
+	u64 arrival_time;
+	u64 delta_start;
+};
+
+/*
+ * thread runtime info.
+ */
+struct osn_thread {
+	u64 count;
+	u64 arrival_time;
+	u64 delta_start;
+};
+
+/*
+ * Runtime information: this structure saves the runtime information used by
+ * one sampling thread.
+ */
+struct osnoise_variables {
+	struct task_struct *kthread;
+	bool sampling;
+	pid_t pid;
+	struct osn_nmi nmi;
+	struct osn_irq irq;
+	struct osn_softirq softirq;
+	struct osn_thread thread;
+	local_t int_counter;
+};
+
+/*
+ * Per-cpu runtime information.
+ */ +DEFINE_PER_CPU(struct osnoise_variables, per_cpu_osnoise_var); + +/* + * this_cpu_osn_var - Return the per-cpu osnoise_variables on its relative CPU + */ +static inline struct osnoise_variables *this_cpu_osn_var(void) +{ + return this_cpu_ptr(&per_cpu_osnoise_var); +} + +/* + * osn_var_reset - Reset the values of the given osnoise_variables + */ +static inline void osn_var_reset(struct osnoise_variables *osn_var) +{ + /* + * So far, all the values are initialized as 0, so + * zeroing the structure is perfect. + */ + memset(osn_var, 0, sizeof(*osn_var)); +} + +/* + * osn_var_reset_all - Reset the value of all per-cpu osnoise_variables + */ +static inline void osn_var_reset_all(void) +{ + struct osnoise_variables *osn_var; + int cpu; + + for_each_cpu(cpu, cpu_online_mask) { + osn_var = per_cpu_ptr(&per_cpu_osnoise_var, cpu); + osn_var_reset(osn_var); + } +} + +/* + * Tells NMIs to call back to the osnoise tracer to record timestamps. + */ +bool trace_osnoise_callback_enabled; + +/* + * osnoise sample structure definition. Used to store the statistics of a + * sample run. + */ +struct osnoise_sample { + u64 runtime; /* runtime */ + u64 noise; /* noise */ + u64 max_sample; /* max single noise sample */ + int hw_count; /* # HW (incl. hypervisor) interference */ + int nmi_count; /* # NMIs during this sample */ + int irq_count; /* # IRQs during this sample */ + int softirq_count; /* # softirqs during this sample */ + int thread_count; /* # threads during this sample */ +}; + +/* + * Protect the interface. + */ +struct mutex interface_lock; + +/* + * Tracer data. + */ +static struct osnoise_data { + u64 sample_period; /* total sampling period */ + u64 sample_runtime; /* active sampling portion of period */ + u64 stop_tracing; /* stop trace in the inside operation (loop) */ + u64 stop_tracing_total; /* stop trace in the outside operation (report) */ + bool tainted; /* inform users and developers about a problem */ +} osnoise_data = { + .sample_period = DEFAULT_SAMPLE_PERIOD, + .sample_runtime = DEFAULT_SAMPLE_RUNTIME, + .stop_tracing = 0, + .stop_tracing_total = 0, +}; + +/* + * Boolean variable used to inform that the tracer is currently sampling. + */ +static bool osnoise_busy; + +/* + * Print the osnoise header info. + */ +static void print_osnoise_headers(struct seq_file *s) +{ + if (osnoise_data.tainted) + seq_puts(s, "# osnoise is tainted!\n"); + + seq_puts(s, "# _-----=> irqs-off\n"); + seq_puts(s, "# / _----=> need-resched\n"); + seq_puts(s, "# | / _---=> hardirq/softirq\n"); + seq_puts(s, "# || / _--=> preempt-depth "); + seq_puts(s, " MAX\n"); + + seq_puts(s, "# || / "); + seq_puts(s, " SINGLE Interference counters:\n"); + + seq_puts(s, "# |||| RUNTIME "); + seq_puts(s, " NOISE %% OF CPU NOISE +-----------------------------+\n"); + + seq_puts(s, "# TASK-PID CPU# |||| TIMESTAMP IN US "); + seq_puts(s, " IN US AVAILABLE IN US HW NMI IRQ SIRQ THREAD\n"); + + seq_puts(s, "# | | | |||| | | "); + seq_puts(s, " | | | | | | | |\n"); +} + +/* + * osnoise_taint - report an osnoise error. + */ +#define osnoise_taint(msg) ({ \ + struct trace_array *tr = osnoise_trace; \ + \ + trace_array_printk_buf(tr->array_buffer.buffer, _THIS_IP_, msg); \ + osnoise_data.tainted = true; \ +}) + +/* + * Record an osnoise_sample into the tracer buffer.
+ */ +static void trace_osnoise_sample(struct osnoise_sample *sample) +{ + struct trace_array *tr = osnoise_trace; + struct trace_buffer *buffer = tr->array_buffer.buffer; + struct trace_event_call *call = &event_osnoise; + struct ring_buffer_event *event; + struct osnoise_entry *entry; + + event = trace_buffer_lock_reserve(buffer, TRACE_OSNOISE, sizeof(*entry), + tracing_gen_ctx()); + if (!event) + return; + entry = ring_buffer_event_data(event); + entry->runtime = sample->runtime; + entry->noise = sample->noise; + entry->max_sample = sample->max_sample; + entry->hw_count = sample->hw_count; + entry->nmi_count = sample->nmi_count; + entry->irq_count = sample->irq_count; + entry->softirq_count = sample->softirq_count; + entry->thread_count = sample->thread_count; + + if (!call_filter_check_discard(call, entry, buffer, event)) + trace_buffer_unlock_commit_nostack(buffer, event); +} + +/* + * Macros to encapsulate the time capturing infrastructure. + */ +#define time_get() trace_clock_local() +#define time_to_us(x) div_u64(x, 1000) +#define time_sub(a, b) ((a) - (b)) + +/* + * cond_move_irq_delta_start - Forward the delta_start of a running IRQ + * + * If an IRQ is preempted by an NMI, its delta_start is pushed forward + * to discount the NMI interference. + * + * See get_int_safe_duration(). + */ +static inline void +cond_move_irq_delta_start(struct osnoise_variables *osn_var, u64 duration) +{ + if (osn_var->irq.delta_start) + osn_var->irq.delta_start += duration; +} + +#ifndef CONFIG_PREEMPT_RT +/* + * cond_move_softirq_delta_start - Forward the delta_start of a running softirq. + * + * If a softirq is preempted by an IRQ or NMI, its delta_start is pushed + * forward to discount the interference. + * + * See get_int_safe_duration(). + */ +static inline void +cond_move_softirq_delta_start(struct osnoise_variables *osn_var, u64 duration) +{ + if (osn_var->softirq.delta_start) + osn_var->softirq.delta_start += duration; +} +#else /* CONFIG_PREEMPT_RT */ +#define cond_move_softirq_delta_start(osn_var, duration) do {} while (0) +#endif + +/* + * cond_move_thread_delta_start - Forward the delta_start of a running thread + * + * If a noisy thread is preempted by a softirq, IRQ or NMI, its delta_start + * is pushed forward to discount the interference. + * + * See get_int_safe_duration(). + */ +static inline void +cond_move_thread_delta_start(struct osnoise_variables *osn_var, u64 duration) +{ + if (osn_var->thread.delta_start) + osn_var->thread.delta_start += duration; +} + +/* + * get_int_safe_duration - Get the duration of a window + * + * The irq, softirq and thread variables need to have their duration without + * the interference from higher priority interrupts. Instead of keeping a + * variable to discount the interrupt interference from these variables, the + * starting time of these variables is pushed forward with the interrupt's + * duration. In this way, a single variable is used to: + * + * - Know if a given window is being measured. + * - Account its duration. + * - Discount the interference. + * + * To avoid getting inconsistent values, e.g.,: + * + * now = time_get() + * ---> interrupt! + * delta_start -= int duration; + * <--- + * duration = now - delta_start; + * + * result: negative duration if the variable duration before the + * interrupt was smaller than the interrupt execution. + * + * A counter of interrupts is used. If the counter increased, try + * to capture an interference safe duration.
+ */ +static inline s64 +get_int_safe_duration(struct osnoise_variables *osn_var, u64 *delta_start) +{ + u64 int_counter, now; + s64 duration; + + do { + int_counter = local_read(&osn_var->int_counter); + /* synchronize with interrupts */ + barrier(); + + now = time_get(); + duration = (now - *delta_start); + + /* synchronize with interrupts */ + barrier(); + } while (int_counter != local_read(&osn_var->int_counter)); + + /* + * This is evidence of race conditions that cause + * a value to be "discounted" too much. + */ + if (duration < 0) + osnoise_taint("Negative duration!\n"); + + *delta_start = 0; + + return duration; +} + +/* + * set_int_safe_time - Save the current time on *time, aware of interference + * + * Get the time, taking into consideration a possible interference from + * higher priority interrupts. + * + * See get_int_safe_duration() for an explanation. + */ +static u64 +set_int_safe_time(struct osnoise_variables *osn_var, u64 *time) +{ + u64 int_counter; + + do { + int_counter = local_read(&osn_var->int_counter); + /* synchronize with interrupts */ + barrier(); + + *time = time_get(); + + /* synchronize with interrupts */ + barrier(); + } while (int_counter != local_read(&osn_var->int_counter)); + + return int_counter; +} + +/* + * trace_osnoise_callback - NMI entry/exit callback + * + * This function is called at the NMI entry and exit code. The bool enter + * distinguishes between either case. This function is used to note an NMI + * occurrence, compute the noise caused by the NMI, and to remove the noise + * it is potentially causing on other interference variables. + */ +void trace_osnoise_callback(bool enter) +{ + struct osnoise_variables *osn_var = this_cpu_osn_var(); + u64 duration; + + if (!osn_var->sampling) + return; + + /* + * Currently trace_clock_local() calls sched_clock() and the + * generic version is not NMI safe. + */ + if (!IS_ENABLED(CONFIG_GENERIC_SCHED_CLOCK)) { + if (enter) { + osn_var->nmi.delta_start = time_get(); + local_inc(&osn_var->int_counter); + } else { + duration = time_get() - osn_var->nmi.delta_start; + + trace_nmi_noise(osn_var->nmi.delta_start, duration); + + cond_move_irq_delta_start(osn_var, duration); + cond_move_softirq_delta_start(osn_var, duration); + cond_move_thread_delta_start(osn_var, duration); + } + } + + if (enter) + osn_var->nmi.count++; +} + +/* + * osnoise_trace_irq_entry - Note the starting of an IRQ + * + * Save the starting time of an IRQ. As IRQs are non-preemptive to other IRQs, + * it is safe to use a single variable (osn_var->irq) to save the statistics. + * The arrival_time is used to report... the arrival time. The delta_start + * is used to compute the duration at the IRQ exit handler. See + * cond_move_irq_delta_start(). + */ +void osnoise_trace_irq_entry(int id) +{ + struct osnoise_variables *osn_var = this_cpu_osn_var(); + + if (!osn_var->sampling) + return; + /* + * This value will be used in the report, but not to compute + * the execution time, so it is safe to get it unsafe. + */ + osn_var->irq.arrival_time = time_get(); + set_int_safe_time(osn_var, &osn_var->irq.delta_start); + osn_var->irq.count++; + + local_inc(&osn_var->int_counter); +} + +/* + * osnoise_trace_irq_exit - Note the end of an IRQ, save data and trace + * + * Computes the duration of the IRQ noise and traces it. Also discounts the + * interference from other sources of noise that could currently be accounted.
+ */ +void osnoise_trace_irq_exit(int id, const char *desc) +{ + struct osnoise_variables *osn_var = this_cpu_osn_var(); + int duration; + + if (!osn_var->sampling) + return; + + duration = get_int_safe_duration(osn_var, &osn_var->irq.delta_start); + trace_irq_noise(id, desc, osn_var->irq.arrival_time, duration); + osn_var->irq.arrival_time = 0; + cond_move_softirq_delta_start(osn_var, duration); + cond_move_thread_delta_start(osn_var, duration); +} + +/* + * trace_irqentry_callback - Callback to the irq:irq_entry traceevent + * + * Used to note the starting of an IRQ occurrence. + */ +static void trace_irqentry_callback(void *data, int irq, + struct irqaction *action) +{ + osnoise_trace_irq_entry(irq); +} + +/* + * trace_irqexit_callback - Callback to the irq:irq_exit traceevent + * + * Used to note the end of an IRQ occurrence. + */ +static void trace_irqexit_callback(void *data, int irq, + struct irqaction *action, int ret) +{ + osnoise_trace_irq_exit(irq, action->name); +} + +/* + * arch specific register function. + */ +int __weak osnoise_arch_register(void) +{ + return 0; +} + +/* + * arch specific unregister function. + */ +void __weak osnoise_arch_unregister(void) +{ + return; +} + +/* + * hook_irq_events - Hook IRQ handling events + * + * This function hooks the IRQ related callbacks to the respective trace + * events. + */ +int hook_irq_events(void) +{ + int ret; + + ret = register_trace_irq_handler_entry(trace_irqentry_callback, NULL); + if (ret) + goto out_err; + + ret = register_trace_irq_handler_exit(trace_irqexit_callback, NULL); + if (ret) + goto out_unregister_entry; + + ret = osnoise_arch_register(); + if (ret) + goto out_irq_exit; + + return 0; + +out_irq_exit: + unregister_trace_irq_handler_exit(trace_irqexit_callback, NULL); +out_unregister_entry: + unregister_trace_irq_handler_entry(trace_irqentry_callback, NULL); +out_err: + return -EINVAL; +} + +/* + * unhook_irq_events - Unhook IRQ handling events + * + * This function unhooks the IRQ related callbacks from the respective trace + * events. + */ +void unhook_irq_events(void) +{ + osnoise_arch_unregister(); + unregister_trace_irq_handler_exit(trace_irqexit_callback, NULL); + unregister_trace_irq_handler_entry(trace_irqentry_callback, NULL); +} + +#ifndef CONFIG_PREEMPT_RT +/* + * trace_softirq_entry_callback - Note the starting of a softirq + * + * Save the starting time of a softirq. As softirqs are non-preemptive to + * other softirqs, it is safe to use a single variable (osn_var->softirq) + * to save the statistics. The arrival_time is used to report... the + * arrival time. The delta_start is used to compute the duration at the + * softirq exit handler. See cond_move_softirq_delta_start(). + */ +void trace_softirq_entry_callback(void *data, unsigned int vec_nr) +{ + struct osnoise_variables *osn_var = this_cpu_osn_var(); + + if (!osn_var->sampling) + return; + /* + * This value will be used in the report, but not to compute + * the execution time, so it is safe to get it unsafe. + */ + osn_var->softirq.arrival_time = time_get(); + set_int_safe_time(osn_var, &osn_var->softirq.delta_start); + osn_var->softirq.count++; + + local_inc(&osn_var->int_counter); +} + +/* + * trace_softirq_exit_callback - Note the end of a softirq + * + * Computes the duration of the softirq noise and traces it. Also discounts the + * interference from other sources of noise that could currently be accounted.
+ */ +void trace_softirq_exit_callback(void *data, unsigned int vec_nr) +{ + struct osnoise_variables *osn_var = this_cpu_osn_var(); + int duration; + + if (!osn_var->sampling) + return; + + duration = get_int_safe_duration(osn_var, &osn_var->softirq.delta_start); + trace_softirq_noise(vec_nr, osn_var->softirq.arrival_time, duration); + cond_move_thread_delta_start(osn_var, duration); + osn_var->softirq.arrival_time = 0; +} + +/* + * hook_softirq_events - Hook softirq handling events + * + * This function hooks the softirq related callbacks to the respective trace + * events. + */ +static int hook_softirq_events(void) +{ + int ret; + + ret = register_trace_softirq_entry(trace_softirq_entry_callback, NULL); + if (ret) + goto out_err; + + ret = register_trace_softirq_exit(trace_softirq_exit_callback, NULL); + if (ret) + goto out_unreg_entry; + + return 0; + +out_unreg_entry: + unregister_trace_softirq_entry(trace_softirq_entry_callback, NULL); +out_err: + return -EINVAL; +} + +/* + * unhook_softirq_events - Unhook softirq handling events + * + * This function unhooks the softirq related callbacks from the respective trace + * events. + */ +static void unhook_softirq_events(void) +{ + unregister_trace_softirq_entry(trace_softirq_entry_callback, NULL); + unregister_trace_softirq_exit(trace_softirq_exit_callback, NULL); +} +#else /* CONFIG_PREEMPT_RT */ +/* + * softirqs are threads on the PREEMPT_RT mode. + */ +static int hook_softirq_events(void) +{ + return 0; +} +static void unhook_softirq_events(void) +{ +} +#endif + +/* + * thread_entry - Record the starting of a thread noise window + * + * It saves the context switch time for a noisy thread, and increments + * the interference counters. + */ +static void +thread_entry(struct osnoise_variables *osn_var, struct task_struct *t) +{ + if (!osn_var->sampling) + return; + /* + * The arrival time will be used in the report, but not to compute + * the execution time, so it is safe to get it unsafe. + */ + osn_var->thread.arrival_time = time_get(); + + set_int_safe_time(osn_var, &osn_var->thread.delta_start); + + osn_var->thread.count++; + local_inc(&osn_var->int_counter); +} + +/* + * thread_exit - Report the end of a thread noise window + * + * It computes the total noise from a thread, tracing if needed. + */ +static void +thread_exit(struct osnoise_variables *osn_var, struct task_struct *t) +{ + int duration; + + if (!osn_var->sampling) + return; + + duration = get_int_safe_duration(osn_var, &osn_var->thread.delta_start); + + trace_thread_noise(t, osn_var->thread.arrival_time, duration); + + osn_var->thread.arrival_time = 0; +} + +/* + * trace_sched_switch_callback - sched:sched_switch trace event handler + * + * This function is hooked to the sched:sched_switch trace event, and it is + * used to record the beginning and to report the end of a thread noise window. + */ +void +trace_sched_switch_callback(void *data, bool preempt, struct task_struct *p, + struct task_struct *n) +{ + struct osnoise_variables *osn_var = this_cpu_osn_var(); + + if (p->pid != osn_var->pid) + thread_exit(osn_var, p); + + if (n->pid != osn_var->pid) + thread_entry(osn_var, n); +} + +/* + * hook_thread_events - Hook the instrumentation for thread noise + * + * Hook the osnoise tracer callbacks to handle the noise from other + * threads on the necessary kernel events.
+ */ +int hook_thread_events(void) +{ + int ret; + + ret = register_trace_sched_switch(trace_sched_switch_callback, NULL); + if (ret) + return -EINVAL; + + return 0; +} + +/* + * unhook_thread_events - Unhook the instrumentation for thread noise + * + * Unhook the osnoise tracer callbacks to handle the noise from other + * threads on the necessary kernel events. + */ +void unhook_thread_events(void) +{ + unregister_trace_sched_switch(trace_sched_switch_callback, NULL); +} + +/* + * save_osn_sample_stats - Save the osnoise_sample statistics + * + * Save the osnoise_sample statistics before the sampling phase. These + * values will be used later to compute the diff between the statistics + * before and after the osnoise sampling. + */ +void save_osn_sample_stats(struct osnoise_variables *osn_var, struct osnoise_sample *s) +{ + s->nmi_count = osn_var->nmi.count; + s->irq_count = osn_var->irq.count; + s->softirq_count = osn_var->softirq.count; + s->thread_count = osn_var->thread.count; +} + +/* + * diff_osn_sample_stats - Compute the osnoise_sample statistics + * + * After a sample period, compute the difference on the osnoise_sample + * statistics. The struct osnoise_sample *s contains the statistics saved via + * save_osn_sample_stats() before the osnoise sampling. + */ +void diff_osn_sample_stats(struct osnoise_variables *osn_var, struct osnoise_sample *s) +{ + s->nmi_count = osn_var->nmi.count - s->nmi_count; + s->irq_count = osn_var->irq.count - s->irq_count; + s->softirq_count = osn_var->softirq.count - s->softirq_count; + s->thread_count = osn_var->thread.count - s->thread_count; +} + +/* + * osnoise_stop_tracing - Stop tracing and the tracer. + */ +static void osnoise_stop_tracing(void) +{ + struct trace_array *tr = osnoise_trace; + tracer_tracing_off(tr); +} + +/* + * run_osnoise - Sample the time and look for osnoise + * + * Used to capture the time, looking for potential osnoise latency repeatedly. + * Different from hwlat_detector, it is called with preemption and interrupts + * enabled. This allows irqs, softirqs and threads to run, interfering with the + * osnoise sampling thread, as they would do with a regular thread. + */ +static int run_osnoise(void) +{ + struct osnoise_variables *osn_var = this_cpu_osn_var(); + u64 noise = 0, sum_noise = 0, max_noise = 0; + struct trace_array *tr = osnoise_trace; + u64 start, sample, last_sample; + u64 last_int_count, int_count; + s64 total, last_total = 0; + struct osnoise_sample s; + unsigned int threshold; + int hw_count = 0; + u64 runtime, stop_in; + int ret = -1; + + /* + * Considers the current thread as the workload. + */ + osn_var->pid = current->pid; + + /* + * Save the current stats for the diff + */ + save_osn_sample_stats(osn_var, &s); + + /* + * if threshold is 0, use the default value of 5 us. + */ + threshold = tracing_thresh ? : 5000; + + /* + * Make sure NMIs see sampling first + */ + osn_var->sampling = true; + barrier(); + + /* + * Transform the *_us config to nanoseconds to avoid the + * division on the main loop. + */ + runtime = osnoise_data.sample_runtime * NSEC_PER_USEC; + stop_in = osnoise_data.stop_tracing * NSEC_PER_USEC; + + /* + * Start timestamp + */ + start = time_get(); + + /* + * "previous" loop. + */ + last_int_count = set_int_safe_time(osn_var, &last_sample); + + do { + /* + * Get sample! + */ + int_count = set_int_safe_time(osn_var, &sample); + + noise = time_sub(sample, last_sample); + + /* + * This shouldn't happen.
+ */ + if (noise < 0) { + osnoise_taint("negative noise!"); + goto out; + } + + /* + * Sample runtime. + */ + total = time_sub(sample, start); + + /* + * Check for possible overflows. + */ + if (total < last_total) { + osnoise_taint("total overflow!"); + break; + } + + last_total = total; + + if (noise >= threshold) { + int interference = int_count - last_int_count; + + if (noise > max_noise) + max_noise = noise; + + if (!interference) + hw_count++; + + sum_noise += noise; + + trace_sample_threshold(last_sample, noise, interference); + + if (osnoise_data.stop_tracing) + if (noise > stop_in) + osnoise_stop_tracing(); + } + + /* + * For the non-preemptive kernel config: let threads run, if + * they so wish. + */ + cond_resched(); + + last_sample = sample; + last_int_count = int_count; + + } while (total < runtime && !kthread_should_stop()); + + /* + * Finish the above in the view of interrupts. + */ + barrier(); + + osn_var->sampling = false; + + /* + * Make sure sampling data is no longer updated. + */ + barrier(); + + /* + * Save noise info. + */ + s.noise = time_to_us(sum_noise); + s.runtime = time_to_us(total); + s.max_sample = time_to_us(max_noise); + s.hw_count = hw_count; + + /* Save interference stats info */ + diff_osn_sample_stats(osn_var, &s); + + trace_osnoise_sample(&s); + + /* Keep a running maximum ever recorded osnoise "latency" */ + if (max_noise > tr->max_latency) { + tr->max_latency = max_noise; + latency_fsnotify(tr); + } + + if (osnoise_data.stop_tracing_total) + if (s.noise > osnoise_data.stop_tracing_total) + osnoise_stop_tracing(); + + return 0; +out: + return ret; +} + +static struct cpumask osnoise_cpumask; +static struct cpumask save_cpumask; + +/* + * osnoise_main - The osnoise detection kernel thread + * + * Calls the run_osnoise() function to measure the osnoise for the configured runtime, + * every period. + */ +static int osnoise_main(void *data) +{ + s64 interval; + + while (!kthread_should_stop()) { + + run_osnoise(); + + mutex_lock(&interface_lock); + interval = osnoise_data.sample_period - osnoise_data.sample_runtime; + mutex_unlock(&interface_lock); + + do_div(interval, USEC_PER_MSEC); + + /* + * Differently from hwlat_detector, the osnoise tracer can run + * without a pause because preemption is on. + */ + if (interval < 1) + continue; + + if (msleep_interruptible(interval)) + break; + } + + return 0; +} + +/* + * stop_per_cpu_kthreads - Stop per-cpu threads + * + * Stop the osnoise sampling threads. Use this on unload and at system + * shutdown. + */ +static void stop_per_cpu_kthreads(void) +{ + struct task_struct *kthread; + int cpu; + + for_each_online_cpu(cpu) { + kthread = per_cpu(per_cpu_osnoise_var, cpu).kthread; + if (kthread) + kthread_stop(kthread); + per_cpu(per_cpu_osnoise_var, cpu).kthread = NULL; + } +} + +/* + * start_per_cpu_kthreads - Kick off per-cpu osnoise sampling kthreads + * + * This starts the kernel threads that will look for osnoise on many + * cpus. + */ +static int start_per_cpu_kthreads(struct trace_array *tr) +{ + struct cpumask *current_mask = &save_cpumask; + struct task_struct *kthread; + char comm[24]; + int cpu; + + get_online_cpus(); + /* + * Run only on CPUs in which trace and osnoise are allowed to run. + */ + cpumask_and(current_mask, tr->tracing_cpumask, &osnoise_cpumask); + /* + * And the CPU is online.
+ */ + cpumask_and(current_mask, cpu_online_mask, current_mask); + put_online_cpus(); + + for_each_online_cpu(cpu) + per_cpu(per_cpu_osnoise_var, cpu).kthread = NULL; + + for_each_cpu(cpu, current_mask) { + snprintf(comm, 24, "osnoise/%d", cpu); + + kthread = kthread_create_on_cpu(osnoise_main, NULL, cpu, comm); + + if (IS_ERR(kthread)) { + pr_err(BANNER "could not start sampling thread\n"); + stop_per_cpu_kthreads(); + return -ENOMEM; + } + + per_cpu(per_cpu_osnoise_var, cpu).kthread = kthread; + wake_up_process(kthread); + } + + return 0; +} + +/* + * osnoise_cpus_read - Read function for reading the "cpus" file + * @filp: The active open file structure + * @ubuf: The userspace provided buffer to read value into + * @count: The maximum number of bytes to read + * @ppos: The current "file" position + * + * Prints the "cpus" output into the user-provided buffer. + */ +static ssize_t +osnoise_cpus_read(struct file *filp, char __user *ubuf, size_t count, + loff_t *ppos) +{ + char *mask_str; + int len; + + mutex_lock(&interface_lock); + + len = snprintf(NULL, 0, "%*pbl\n", cpumask_pr_args(&osnoise_cpumask)) + 1; + mask_str = kmalloc(len, GFP_KERNEL); + if (!mask_str) { + count = -ENOMEM; + goto out_unlock; + } + + len = snprintf(mask_str, len, "%*pbl\n", cpumask_pr_args(&osnoise_cpumask)); + if (len >= count) { + count = -EINVAL; + goto out_free; + } + + count = simple_read_from_buffer(ubuf, count, ppos, mask_str, len); + +out_free: + kfree(mask_str); +out_unlock: + mutex_unlock(&interface_lock); + + return count; +} + +static void osnoise_tracer_start(struct trace_array *tr); +static void osnoise_tracer_stop(struct trace_array *tr); + +/* + * osnoise_cpus_write - Write function for "cpus" entry + * @filp: The active open file structure + * @ubuf: The user buffer that contains the value to write + * @count: The maximum number of bytes to write to "file" + * @ppos: The current position in the file + * + * This function provides a write implementation for the "cpus" + * interface to the osnoise trace. By default, it lists all CPUs, + * in this way, allowing osnoise threads to run on any online CPU + * of the system. It serves to restrict the execution of osnoise to the + * set of CPUs written via this interface. Note that osnoise also + * respects the "tracing_cpumask." Hence, osnoise threads will run only + * on the set of CPUs allowed here AND on "tracing_cpumask." Why not + * have just "tracing_cpumask?" Because the user might be interested + * in tracing what is running on other CPUs. For instance, one might + * run osnoise in one HT CPU while observing what is running on the + * sibling HT CPU. + */ +static ssize_t +osnoise_cpus_write(struct file *filp, const char __user *ubuf, size_t count, + loff_t *ppos) +{ + struct trace_array *tr = osnoise_trace; + cpumask_var_t osnoise_cpumask_new; + int running, err; + char buf[256]; + + if (count >= 256) + return -EINVAL; + + if (copy_from_user(buf, ubuf, count)) + return -EFAULT; + + if (!zalloc_cpumask_var(&osnoise_cpumask_new, GFP_KERNEL)) + return -ENOMEM; + + err = cpulist_parse(buf, osnoise_cpumask_new); + if (err) + goto err_free; + + /* + * trace_types_lock is taken to avoid concurrency on start/stop + * and osnoise_busy.
+ */ + mutex_lock(&trace_types_lock); + running = osnoise_busy; + if (running) + osnoise_tracer_stop(tr); + + mutex_lock(&interface_lock); + cpumask_copy(&osnoise_cpumask, osnoise_cpumask_new); + mutex_unlock(&interface_lock); + + if (running) + osnoise_tracer_start(tr); + mutex_unlock(&trace_types_lock); + + free_cpumask_var(osnoise_cpumask_new); + return count; + +err_free: + free_cpumask_var(osnoise_cpumask_new); + + return err; +} + +/* + * osnoise/runtime_us: cannot be greater than the period. + */ +static struct trace_min_max_param osnoise_runtime = { + .lock = &interface_lock, + .val = &osnoise_data.sample_runtime, + .max = &osnoise_data.sample_period, + .min = NULL, +}; + +/* + * osnoise/period_us: cannot be smaller than the runtime. + */ +static struct trace_min_max_param osnoise_period = { + .lock = &interface_lock, + .val = &osnoise_data.sample_period, + .max = NULL, + .min = &osnoise_data.sample_runtime, +}; + +/* + * osnoise/stop_tracing_us: no limit. + */ +static struct trace_min_max_param osnoise_stop_tracing_in = { + .lock = &interface_lock, + .val = &osnoise_data.stop_tracing, + .max = NULL, + .min = NULL, +}; + +/* + * osnoise/stop_tracing_total_us: no limit. + */ +static struct trace_min_max_param osnoise_stop_tracing_total = { + .lock = &interface_lock, + .val = &osnoise_data.stop_tracing_total, + .max = NULL, + .min = NULL, +}; + +static const struct file_operations cpus_fops = { + .open = tracing_open_generic, + .read = osnoise_cpus_read, + .write = osnoise_cpus_write, + .llseek = generic_file_llseek, +}; + +/* + * init_tracefs - A function to initialize the tracefs interface files + * + * This function creates entries in tracefs for "osnoise". It creates the + * "osnoise" directory in the tracing directory, and within that + * directory are the runtime, period and stop_tracing files to change and view + * those values. + */ +static int init_tracefs(void) +{ + struct dentry *top_dir; + struct dentry *tmp; + int ret; + + ret = tracing_init_dentry(); + if (ret) + return -ENOMEM; + + top_dir = tracefs_create_dir("osnoise", NULL); + if (!top_dir) + return -ENOMEM; + + tmp = tracefs_create_file("period_us", 0640, top_dir, + &osnoise_period, &trace_min_max_fops); + if (!tmp) + goto err; + + tmp = tracefs_create_file("runtime_us", 0644, top_dir, + &osnoise_runtime, &trace_min_max_fops); + if (!tmp) + goto err; + + tmp = tracefs_create_file("stop_tracing_us", 0640, top_dir, + &osnoise_stop_tracing_in, &trace_min_max_fops); + if (!tmp) + goto err; + + tmp = tracefs_create_file("stop_tracing_total_us", 0640, top_dir, + &osnoise_stop_tracing_total, &trace_min_max_fops); + if (!tmp) + goto err; + + tmp = trace_create_file("cpus", 0644, top_dir, NULL, &cpus_fops); + if (!tmp) + goto err; + + return 0; + +err: + tracefs_remove(top_dir); + return -ENOMEM; +} + +static int osnoise_hook_events(void) +{ + int retval; + + /* + * Trace is already hooked, we are re-enabling from + * a stop_tracing_*. + */ + if (trace_osnoise_callback_enabled) + return 0; + + retval = hook_irq_events(); + if (retval) + return -EINVAL; + + retval = hook_softirq_events(); + if (retval) + goto out_unhook_irq; + + retval = hook_thread_events(); + /* + * All fine!
+ */ + if (!retval) + return 0; + + unhook_softirq_events(); +out_unhook_irq: + unhook_irq_events(); + return -EINVAL; +} + +static void osnoise_tracer_start(struct trace_array *tr) +{ + int retval; + + if (osnoise_busy) + return; + + osn_var_reset_all(); + + retval = osnoise_hook_events(); + if (retval) + goto out_err; + /* + * Make sure NMIs see the reset values. + */ + barrier(); + trace_osnoise_callback_enabled = true; + + retval = start_per_cpu_kthreads(tr); + /* + * All fine! + */ + if (!retval) + return; + +out_err: + unhook_irq_events(); + pr_err(BANNER "Error starting osnoise tracer\n"); +} + +static void osnoise_tracer_stop(struct trace_array *tr) +{ + if (!osnoise_busy) + return; + + trace_osnoise_callback_enabled = false; + barrier(); + + stop_per_cpu_kthreads(); + + unhook_irq_events(); + unhook_softirq_events(); + unhook_thread_events(); + + osnoise_busy = false; +} + +static int osnoise_tracer_init(struct trace_array *tr) +{ + /* Only allow one instance to enable this */ + if (osnoise_busy) + return -EBUSY; + + osnoise_trace = tr; + + tr->max_latency = 0; + + osnoise_tracer_start(tr); + + osnoise_busy = true; + + return 0; +} + +static void osnoise_tracer_reset(struct trace_array *tr) +{ + osnoise_tracer_stop(tr); +} + +static struct tracer osnoise_tracer __read_mostly = { + .name = "osnoise", + .init = osnoise_tracer_init, + .reset = osnoise_tracer_reset, + .start = osnoise_tracer_start, + .stop = osnoise_tracer_stop, + .print_header = print_osnoise_headers, + .allow_instances = true, +}; + +__init static int init_osnoise_tracer(void) +{ + int ret; + + mutex_init(&interface_lock); + + cpumask_copy(&osnoise_cpumask, cpu_all_mask); + + ret = register_tracer(&osnoise_tracer); + if (ret) + return ret; + + init_tracefs(); + + return 0; +} +late_initcall(init_osnoise_tracer); diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 000e9dc224c6..f1dce388121a 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -1179,7 +1179,6 @@ trace_hwlat_print(struct trace_iterator *iter, int flags, return trace_handle_return(s); }
- static enum print_line_t trace_hwlat_raw(struct trace_iterator *iter, int flags, struct trace_event *event) @@ -1209,6 +1208,76 @@ static struct trace_event trace_hwlat_event = { .funcs = &trace_hwlat_funcs, };
+/* TRACE_OSNOISE */ +static enum print_line_t +trace_osnoise_print(struct trace_iterator *iter, int flags, + struct trace_event *event) +{ + struct trace_entry *entry = iter->ent; + struct trace_seq *s = &iter->seq; + struct osnoise_entry *field; + u64 ratio, ratio_dec; + u64 net_runtime; + + trace_assign_type(field, entry); + + /* + * compute the available % of cpu time. + */ + net_runtime = field->runtime - field->noise; + ratio = net_runtime * 10000000; + do_div(ratio, field->runtime); + ratio_dec = do_div(ratio, 100000); + + trace_seq_printf(s, "%llu %10llu %3llu.%05llu %7llu", + field->runtime, + field->noise, + ratio, ratio_dec, + field->max_sample); + + trace_seq_printf(s, " %6u", field->hw_count); + trace_seq_printf(s, " %6u", field->nmi_count); + trace_seq_printf(s, " %6u", field->irq_count); + trace_seq_printf(s, " %6u", field->softirq_count); + trace_seq_printf(s, " %6u", field->thread_count); + + trace_seq_putc(s, '\n'); + + return trace_handle_return(s); +} + +static enum print_line_t +trace_osnoise_raw(struct trace_iterator *iter, int flags, + struct trace_event *event) +{ + struct osnoise_entry *field; + struct trace_seq *s = &iter->seq; + + trace_assign_type(field, iter->ent); + + trace_seq_printf(s, "%lld %llu %llu %u %u %u %u %u\n", + field->runtime, + field->noise, + field->max_sample, + field->hw_count, + field->nmi_count, + field->irq_count, + field->softirq_count, + field->thread_count); + + return trace_handle_return(s); +} + +static struct trace_event_functions trace_osnoise_funcs = { + .trace = trace_osnoise_print, + .raw = trace_osnoise_raw, +}; + +static struct trace_event trace_osnoise_event = { + .type = TRACE_OSNOISE, + .funcs = &trace_osnoise_funcs, +}; + /* TRACE_BPUTS */ static enum print_line_t trace_bputs_print(struct trace_iterator *iter, int flags, @@ -1374,6 +1443,7 @@ static struct trace_event *events[] __initdata = { &trace_bprint_event, &trace_print_event, &trace_hwlat_event, + &trace_osnoise_event, &trace_raw_data_event, NULL };
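A side note on the patch above: the pattern behind set_int_safe_time() and get_int_safe_duration() is to read the per-CPU interrupt counter, take the timestamp between compiler barriers, and retry whenever the counter changed. The same retry loop can be sketched in user space, with a SIGALRM handler standing in for the interrupt; this is only an illustration of the technique, not code from the patch:

    #include <signal.h>
    #include <stdio.h>
    #include <time.h>
    #include <unistd.h>

    /* Plays the role of osn_var->int_counter. */
    static volatile sig_atomic_t int_counter;

    static void on_alarm(int sig)
    {
            int_counter++;
    }

    static long long now_ns(void)
    {
            struct timespec ts;

            clock_gettime(CLOCK_MONOTONIC, &ts);
            return ts.tv_sec * 1000000000LL + ts.tv_nsec;
    }

    int main(void)
    {
            long long start, end;
            sig_atomic_t snap;

            signal(SIGALRM, on_alarm);
            alarm(1);                       /* the "interrupt" source */

            start = now_ns();
            do {
                    snap = int_counter;
                    __asm__ volatile("" ::: "memory");  /* like barrier() */
                    sleep(2);               /* measured window; the alarm lands here */
                    end = now_ns();
                    __asm__ volatile("" ::: "memory");
            } while (snap != int_counter);  /* interrupted: take the reading again */

            printf("duration: %lld ns, taken with no concurrent interrupt\n",
                   end - start);
            return 0;
    }

The retry pass is guaranteed not to race with the already-fired alarm, which is exactly why the kernel code can trust the duration it returns.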
From: Daniel Bristot de Oliveira bristot@redhat.com
mainline inclusion from mainline-v5.14-rc1 commit a955d7eac1779b437ceb24fc352026a2cbcec140 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4G64B CVE: NA
-------------------------------------------------
The timerlat tracer aims to help preemptive kernel developers find sources of wakeup latencies of real-time threads. Like cyclictest, the tracer sets a periodic timer that wakes up a thread. The thread then computes a *wakeup latency* value as the difference between the *current time* and the *absolute time* that the timer was set to expire. The main goal of timerlat is tracing in such a way to help kernel developers.
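For comparison, this is the same computation cyclictest performs in user space. A minimal sketch of that measurement (illustrative only; the period constant is an assumption matching timerlat's 1 ms default):

    #include <stdio.h>
    #include <time.h>

    #define PERIOD_NS 1000000L           /* 1 ms period */
    #define NSEC_PER_SEC 1000000000L

    int main(void)
    {
            struct timespec next, now;
            int i;

            clock_gettime(CLOCK_MONOTONIC, &next);
            for (i = 0; i < 10; i++) {
                    /* compute the absolute time the timer must expire */
                    next.tv_nsec += PERIOD_NS;
                    if (next.tv_nsec >= NSEC_PER_SEC) {
                            next.tv_nsec -= NSEC_PER_SEC;
                            next.tv_sec++;
                    }
                    clock_nanosleep(CLOCK_MONOTONIC, TIMER_ABSTIME, &next, NULL);
                    clock_gettime(CLOCK_MONOTONIC, &now);
                    /* wakeup latency = current time - absolute expiry time */
                    printf("#%d timer_latency %ld ns\n", i + 1,
                           (now.tv_sec - next.tv_sec) * NSEC_PER_SEC +
                           (now.tv_nsec - next.tv_nsec));
            }
            return 0;
    }

timerlat does the equivalent in the kernel, reporting the latency twice per activation: once from the hrtimer IRQ handler and once from the woken thread.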
Usage
Write the ASCII text "timerlat" into the current_tracer file of the tracing system (generally mounted at /sys/kernel/tracing).
For example:
[root@f32 ~]# cd /sys/kernel/tracing/
[root@f32 tracing]# echo timerlat > current_tracer
It is possible to follow the trace by reading the trace file:
[root@f32 tracing]# cat trace
# tracer: timerlat
#
#                              _-----=> irqs-off
#                             / _----=> need-resched
#                            | / _---=> hardirq/softirq
#                            || / _--=> preempt-depth
#                            || /
#                            ||||             ACTIVATION
#         TASK-PID      CPU# ||||   TIMESTAMP   ID           CONTEXT              LATENCY
#            | |         |   ||||      |        |                 |                     |
        <idle>-0       [000] d.h1    54.029328: #1     context    irq timer_latency      932 ns
         <...>-867     [000] ....    54.029339: #1     context thread timer_latency    11700 ns
        <idle>-0       [001] dNh1    54.029346: #1     context    irq timer_latency     2833 ns
         <...>-868     [001] ....    54.029353: #1     context thread timer_latency     9820 ns
        <idle>-0       [000] d.h1    54.030328: #2     context    irq timer_latency      769 ns
         <...>-867     [000] ....    54.030330: #2     context thread timer_latency     3070 ns
        <idle>-0       [001] d.h1    54.030344: #2     context    irq timer_latency      935 ns
         <...>-868     [001] ....    54.030347: #2     context thread timer_latency     4351 ns
The tracer creates a per-cpu kernel thread with real-time priority that prints two lines at every activation. The first is the *timer latency* observed at the *hardirq* context before the activation of the thread. The second is the *timer latency* observed by the thread, which is the same level that cyclictest reports. The ACTIVATION ID field serves to relate the *irq* execution to its respective *thread* execution.
The irq/thread splitting is important to clarify at which context the unexpected high value is coming from. The *irq* context can be delayed by hardware related actions, such as SMIs, NMIs, IRQs or by a thread masking interrupts. Once the timer happens, the delay can also be influenced by blocking caused by threads. For example, by postponing the scheduler execution via preempt_disable(), by the scheduler execution, or by masking interrupts. Threads can also be delayed by the interference from other threads and IRQs.
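One way to observe such a blocking window is to create one on purpose. A hypothetical test module that masks interrupts for half a millisecond (the module and its names are illustrative, not part of this patch) will show up as an inflated *irq* context timer_latency on the CPU where it runs:

    #include <linux/module.h>
    #include <linux/delay.h>
    #include <linux/irqflags.h>

    static int __init irq_mask_noise_init(void)
    {
            unsigned long flags;

            /* While interrupts are masked, the timerlat timer IRQ cannot fire. */
            local_irq_save(flags);
            udelay(500);
            local_irq_restore(flags);

            return 0;
    }

    static void __exit irq_mask_noise_exit(void)
    {
    }

    module_init(irq_mask_noise_init);
    module_exit(irq_mask_noise_exit);
    MODULE_LICENSE("GPL");

The documentation added later in this patch demonstrates the sibling case, a preempt_disable() section, with the dummy_load_1ms_pd_init example.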
The timerlat tracer can also take advantage of the osnoise: trace events. For example:
[root@f32 ~]# cd /sys/kernel/tracing/
[root@f32 tracing]# echo timerlat > current_tracer
[root@f32 tracing]# echo osnoise > set_event
[root@f32 tracing]# echo 25 > osnoise/stop_tracing_total_us
[root@f32 tracing]# tail -10 trace
       cc1-87882   [005] d..h...   548.771078: #402268 context    irq timer_latency     1585 ns
       cc1-87882   [005] dNLh1..   548.771082: irq_noise: local_timer:236 start 548.771077442 duration 4597 ns
       cc1-87882   [005] dNLh2..   548.771083: irq_noise: reschedule:253 start 548.771083017 duration 56 ns
       cc1-87882   [005] dNLh2..   548.771086: irq_noise: call_function_single:251 start 548.771083811 duration 2048 ns
       cc1-87882   [005] dNLh2..   548.771088: irq_noise: call_function_single:251 start 548.771086814 duration 1495 ns
       cc1-87882   [005] dNLh2..   548.771091: irq_noise: call_function_single:251 start 548.771089194 duration 1558 ns
       cc1-87882   [005] dNLh2..   548.771094: irq_noise: call_function_single:251 start 548.771091719 duration 1932 ns
       cc1-87882   [005] dNLh2..   548.771096: irq_noise: call_function_single:251 start 548.771094696 duration 1050 ns
       cc1-87882   [005] d...3..   548.771101: thread_noise: cc1:87882 start 548.771078243 duration 10909 ns
timerlat/5-1035    [005] .......   548.771103: #402268 context thread timer_latency 25960 ns
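The same setup can be scripted from a program by writing to the tracefs files directly. A minimal sketch, assuming tracefs is mounted at /sys/kernel/tracing and the process runs as root:

    #include <stdio.h>

    /* Write a value to a tracefs file; returns 0 on success. */
    static int tracefs_write(const char *path, const char *val)
    {
            FILE *f = fopen(path, "w");

            if (!f)
                    return -1;
            fputs(val, f);
            return fclose(f);
    }

    int main(void)
    {
            const char *base = "/sys/kernel/tracing";
            char path[256];

            snprintf(path, sizeof(path), "%s/current_tracer", base);
            tracefs_write(path, "timerlat");

            snprintf(path, sizeof(path), "%s/set_event", base);
            tracefs_write(path, "osnoise");

            snprintf(path, sizeof(path), "%s/osnoise/stop_tracing_total_us", base);
            tracefs_write(path, "25");

            return 0;
    }

After that, reading the trace file (for example with tail, as above) shows the combined timerlat and osnoise: events.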
For further information see: Documentation/trace/timerlat-tracer.rst
Link: https://lkml.kernel.org/r/71f18efc013e1194bcaea1e54db957de2b19ba62.162437231...
Cc: Phil Auld pauld@redhat.com Cc: Sebastian Andrzej Siewior bigeasy@linutronix.de Cc: Kate Carcia kcarcia@redhat.com Cc: Jonathan Corbet corbet@lwn.net Cc: Ingo Molnar mingo@redhat.com Cc: Peter Zijlstra peterz@infradead.org Cc: Thomas Gleixner tglx@linutronix.de Cc: Alexandre Chartre alexandre.chartre@oracle.com Cc: Clark Willaims williams@redhat.com Cc: John Kacur jkacur@redhat.com Cc: Juri Lelli juri.lelli@redhat.com Cc: Borislav Petkov bp@alien8.de Cc: "H. Peter Anvin" hpa@zytor.com Cc: x86@kernel.org Cc: linux-doc@vger.kernel.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Daniel Bristot de Oliveira bristot@redhat.com Signed-off-by: Steven Rostedt (VMware) rostedt@goodmis.org Signed-off-by: Wang ShaoBo bobo.shaobowang@huawei.com Acked-by: Xie XiuQi xiexiuqi@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- Documentation/trace/index.rst | 1 + Documentation/trace/timerlat-tracer.rst | 181 +++++++ kernel/trace/Kconfig | 28 ++ kernel/trace/trace.h | 2 + kernel/trace/trace_entries.h | 16 + kernel/trace/trace_osnoise.c | 626 ++++++++++++++++++++++-- kernel/trace/trace_output.c | 47 ++ 7 files changed, 870 insertions(+), 31 deletions(-) create mode 100644 Documentation/trace/timerlat-tracer.rst
diff --git a/Documentation/trace/index.rst b/Documentation/trace/index.rst index 608107b27cc0..3769b9b7aed8 100644 --- a/Documentation/trace/index.rst +++ b/Documentation/trace/index.rst @@ -24,6 +24,7 @@ Linux Tracing Technologies boottime-trace hwlat_detector osnoise-tracer + timerlat-tracer intel_th ring-buffer-design stm diff --git a/Documentation/trace/timerlat-tracer.rst b/Documentation/trace/timerlat-tracer.rst new file mode 100644 index 000000000000..c7cbb557aee7 --- /dev/null +++ b/Documentation/trace/timerlat-tracer.rst @@ -0,0 +1,181 @@ +############### +Timerlat tracer +############### + +The timerlat tracer aims to help the preemptive kernel developers to +find sources of wakeup latencies of real-time threads. Like cyclictest, +the tracer sets a periodic timer that wakes up a thread. The thread then +computes a *wakeup latency* value as the difference between the *current +time* and the *absolute time* that the timer was set to expire. The main +goal of timerlat is tracing in such a way to help kernel developers. + +Usage +----- + +Write the ASCII text "timerlat" into the current_tracer file of the +tracing system (generally mounted at /sys/kernel/tracing). + +For example:: + + [root@f32 ~]# cd /sys/kernel/tracing/ + [root@f32 tracing]# echo timerlat > current_tracer + +It is possible to follow the trace by reading the trace file:: + + [root@f32 tracing]# cat trace + # tracer: timerlat + # + # _-----=> irqs-off + # / _----=> need-resched + # | / _---=> hardirq/softirq + # || / _--=> preempt-depth + # || / + # |||| ACTIVATION + # TASK-PID CPU# |||| TIMESTAMP ID CONTEXT LATENCY + # | | | |||| | | | | + <idle>-0 [000] d.h1 54.029328: #1 context irq timer_latency 932 ns + <...>-867 [000] .... 54.029339: #1 context thread timer_latency 11700 ns + <idle>-0 [001] dNh1 54.029346: #1 context irq timer_latency 2833 ns + <...>-868 [001] .... 54.029353: #1 context thread timer_latency 9820 ns + <idle>-0 [000] d.h1 54.030328: #2 context irq timer_latency 769 ns + <...>-867 [000] .... 54.030330: #2 context thread timer_latency 3070 ns + <idle>-0 [001] d.h1 54.030344: #2 context irq timer_latency 935 ns + <...>-868 [001] .... 54.030347: #2 context thread timer_latency 4351 ns + + +The tracer creates a per-cpu kernel thread with real-time priority that +prints two lines at every activation. The first is the *timer latency* +observed at the *hardirq* context before the activation of the thread. +The second is the *timer latency* observed by the thread. The ACTIVATION +ID field serves to relate the *irq* execution to its respective *thread* +execution. + +The *irq*/*thread* splitting is important to clarify at which context +the unexpected high value is coming from. The *irq* context can be +delayed by hardware related actions, such as SMIs, NMIs, IRQs +or by a thread masking interrupts. Once the timer happens, the delay +can also be influenced by blocking caused by threads. For example, by +postponing the scheduler execution via preempt_disable(), by the +scheduler execution, or by masking interrupts. Threads can +also be delayed by the interference from other threads and IRQs. + +Tracer options +--------------------- + +The timerlat tracer is built on top of the osnoise tracer. +So its configuration is also done in the osnoise/ config +directory. The timerlat configs are: + + - cpus: CPUs on which a timerlat thread will execute. + - timerlat_period_us: the period of the timerlat thread.
+ - osnoise/stop_tracing_us: stop the system tracing if a + timer latency at the *irq* context higher than the configured + value happens. Writing 0 disables this option. + - stop_tracing_total_us: stop the system tracing if a + timer latency at the *thread* context higher than the configured + value happens. Writing 0 disables this option. + - print_stack: save the stack of the IRQ occurrence, and print + it after the *thread context* event. + +timerlat and osnoise +---------------------------- + +The timerlat tracer can also take advantage of the osnoise: trace events. +For example:: + + [root@f32 ~]# cd /sys/kernel/tracing/ + [root@f32 tracing]# echo timerlat > current_tracer + [root@f32 tracing]# echo 1 > events/osnoise/enable + [root@f32 tracing]# echo 25 > osnoise/stop_tracing_total_us + [root@f32 tracing]# tail -10 trace + cc1-87882 [005] d..h... 548.771078: #402268 context irq timer_latency 13585 ns + cc1-87882 [005] dNLh1.. 548.771082: irq_noise: local_timer:236 start 548.771077442 duration 7597 ns + cc1-87882 [005] dNLh2.. 548.771099: irq_noise: qxl:21 start 548.771085017 duration 7139 ns + cc1-87882 [005] d...3.. 548.771102: thread_noise: cc1:87882 start 548.771078243 duration 9909 ns + timerlat/5-1035 [005] ....... 548.771104: #402268 context thread timer_latency 39960 ns + +In this case, the root cause of the timer latency does not point to a +single cause, but to multiple ones. Firstly, the timer IRQ was delayed +for 13 us, which may point to a long IRQ disabled section (see IRQ +stacktrace section). Then the timer interrupt that wakes up the timerlat +thread took 7597 ns, and the qxl:21 device IRQ took 7139 ns. Finally, +the cc1 thread noise took 9909 ns of time before the context switch. +Such pieces of evidence are useful for the developer to use other +tracing methods to figure out how to debug and optimize the system. + +It is worth mentioning that the *duration* values reported +by the osnoise: events are *net* values. For example, the +thread_noise does not include the duration of the overhead caused +by the IRQ execution (which indeed accounted for 12736 ns). But +the values reported by the timerlat tracer (timerlat_latency) +are *gross* values. + +The art below illustrates a CPU timeline and how the timerlat tracer +observes it at the top and the osnoise: events at the bottom. Each "-" +in the timelines means circa 1 us, and the time moves ==>:: + + External timer irq thread + clock latency latency + event 13585 ns 39960 ns + | ^ ^ + v | | + |-------------| | + |-------------+-------------------------| + ^ ^ + ======================================================================== + [tmr irq] [dev irq] + [another thread...^ v..^ v.......][timerlat/ thread] <-- CPU timeline + ========================================================================= + |-------| |-------| + |--^ v-------| + | | | + | | + thread_noise: 9909 ns + | +-> irq_noise: 6139 ns + +-> irq_noise: 7597 ns + +IRQ stacktrace +--------------------------- + +The osnoise/print_stack option is helpful for the cases in which thread +noise is the major factor for the timer latency, because preemption or +IRQs are disabled. For example:: + + [root@f32 tracing]# echo 500 > osnoise/stop_tracing_total_us + [root@f32 tracing]# echo 500 > osnoise/print_stack + [root@f32 tracing]# echo timerlat > current_tracer + [root@f32 tracing]# tail -21 per_cpu/cpu7/trace + insmod-1026 [007] dN.h1.. 200.201948: irq_noise: local_timer:236 start 200.201939376 duration 7872 ns + insmod-1026 [007] d..h1..
200.202587: #29800 context irq timer_latency 1616 ns + insmod-1026 [007] dN.h2.. 200.202598: irq_noise: local_timer:236 start 200.202586162 duration 11855 ns + insmod-1026 [007] dN.h3.. 200.202947: irq_noise: local_timer:236 start 200.202939174 duration 7318 ns + insmod-1026 [007] d...3.. 200.203444: thread_noise: insmod:1026 start 200.202586933 duration 838681 ns + timerlat/7-1001 [007] ....... 200.203445: #29800 context thread timer_latency 859978 ns + timerlat/7-1001 [007] ....1.. 200.203446: <stack trace> + => timerlat_irq + => __hrtimer_run_queues + => hrtimer_interrupt + => __sysvec_apic_timer_interrupt + => asm_call_irq_on_stack + => sysvec_apic_timer_interrupt + => asm_sysvec_apic_timer_interrupt + => delay_tsc + => dummy_load_1ms_pd_init + => do_one_initcall + => do_init_module + => __do_sys_finit_module + => do_syscall_64 + => entry_SYSCALL_64_after_hwframe + +In this case, it is possible to see that the thread added the highest +contribution to the *timer latency* and the stack trace, saved during +the timerlat IRQ handler, points to a function named +dummy_load_1ms_pd_init, which had the following code (on purpose):: + + static int __init dummy_load_1ms_pd_init(void) + { + preempt_disable(); + mdelay(1); + preempt_enable(); + return 0; + + } diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 146ad9eec221..9682ceb1f3df 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -378,6 +378,34 @@ config OSNOISE_TRACER To enable this tracer, echo in "osnoise" into the current_tracer file.
+config TIMERLAT_TRACER + bool "Timerlat tracer" + select OSNOISE_TRACER + select GENERIC_TRACER + help + The timerlat tracer aims to help the preemptive kernel developers + to find sources of wakeup latencies of real-time threads. + + The tracer creates a per-cpu kernel thread with real-time priority. + The tracer thread sets a periodic timer to wake itself up, and goes + to sleep waiting for the timer to fire. At the wakeup, the thread + then computes a wakeup latency value as the difference between + the current time and the absolute time that the timer was set + to expire. + + The tracer prints two lines at every activation. The first is the + timer latency observed at the hardirq context before the + activation of the thread. The second is the timer latency observed + by the thread, which is the same level that cyclictest reports. The + ACTIVATION ID field serves to relate the irq execution to its + respective thread execution. + + The tracer is built on top of the osnoise tracer, and the osnoise: + events can be used to trace the source of interference from NMI, + IRQs and other threads. It also enables the capture of the + stacktrace at the IRQ context, which helps to identify the code + path that can cause thread delay. + config MMIOTRACE bool "Memory mapped IO tracing" depends on HAVE_MMIOTRACE_SUPPORT && PCI diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 75fd887496a1..46c8a6ac9e3c 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -45,6 +45,7 @@ enum trace_type { TRACE_BPUTS, TRACE_HWLAT, TRACE_OSNOISE, + TRACE_TIMERLAT, TRACE_RAW_DATA,
__TRACE_LAST_TYPE, @@ -453,6 +454,7 @@ extern void __ftrace_bad_type(void); IF_ASSIGN(var, ent, struct bputs_entry, TRACE_BPUTS); \ IF_ASSIGN(var, ent, struct hwlat_entry, TRACE_HWLAT); \ IF_ASSIGN(var, ent, struct osnoise_entry, TRACE_OSNOISE);\ + IF_ASSIGN(var, ent, struct timerlat_entry, TRACE_TIMERLAT);\ IF_ASSIGN(var, ent, struct raw_data_entry, TRACE_RAW_DATA);\ IF_ASSIGN(var, ent, struct trace_mmiotrace_rw, \ TRACE_MMIO_RW); \ diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h index f689b7e5c29d..89bc02efe058 100644 --- a/kernel/trace/trace_entries.h +++ b/kernel/trace/trace_entries.h @@ -363,3 +363,19 @@ FTRACE_ENTRY(osnoise, osnoise_entry, __entry->softirq_count, __entry->thread_count) ); + +FTRACE_ENTRY(timerlat, timerlat_entry, + + TRACE_TIMERLAT, + + F_STRUCT( + __field( unsigned int, seqnum ) + __field( int, context ) + __field( u64, timer_latency ) + ), + + F_printk("seq:%u\tcontext:%d\ttimer_latency:%llu\n", + __entry->seqnum, + __entry->context, + __entry->timer_latency) +); diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c index 4e2c47dc4f19..8546e66bafcb 100644 --- a/kernel/trace/trace_osnoise.c +++ b/kernel/trace/trace_osnoise.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 /* * OS Noise Tracer: computes the OS Noise suffered by a running thread. + * Timerlat Tracer: measures the wakeup latency of a timer triggered IRQ and thread. * * Based on "hwlat_detector" tracer by: * Copyright (C) 2008-2009 Jon Masters, Red Hat, Inc. jcm@redhat.com @@ -21,6 +22,7 @@ #include <linux/cpumask.h> #include <linux/delay.h> #include <linux/sched/clock.h> +#include <uapi/linux/sched/types.h> #include <linux/sched.h> #include "trace.h"
@@ -45,6 +47,9 @@ static struct trace_array *osnoise_trace; #define DEFAULT_SAMPLE_PERIOD 1000000 /* 1s */ #define DEFAULT_SAMPLE_RUNTIME 1000000 /* 1s */
+#define DEFAULT_TIMERLAT_PERIOD 1000 /* 1ms */ +#define DEFAULT_TIMERLAT_PRIO 95 /* FIFO 95 */ + /* * NMI runtime info. */ @@ -62,6 +67,8 @@ struct osn_irq { u64 delta_start; };
+#define IRQ_CONTEXT 0 +#define THREAD_CONTEXT 1 /* * softirq runtime info. */ @@ -108,32 +115,76 @@ static inline struct osnoise_variables *this_cpu_osn_var(void) return this_cpu_ptr(&per_cpu_osnoise_var); }
+#ifdef CONFIG_TIMERLAT_TRACER /* - * osn_var_reset - Reset the values of the given osnoise_variables + * Runtime information for the timer mode. + */ +struct timerlat_variables { + struct task_struct *kthread; + struct hrtimer timer; + u64 rel_period; + u64 abs_period; + bool tracing_thread; + u64 count; +}; + +DEFINE_PER_CPU(struct timerlat_variables, per_cpu_timerlat_var); + +/* + * this_cpu_tmr_var - Return the per-cpu timerlat_variables on its relative CPU + */ +static inline struct timerlat_variables *this_cpu_tmr_var(void) +{ + return this_cpu_ptr(&per_cpu_timerlat_var); +} + +/* + * tlat_var_reset - Reset the values of the given timerlat_variables */ -static inline void osn_var_reset(struct osnoise_variables *osn_var) +static inline void tlat_var_reset(void) { + struct timerlat_variables *tlat_var; + int cpu; /* * So far, all the values are initialized as 0, so * zeroing the structure is perfect. */ - memset(osn_var, 0, sizeof(*osn_var)); + for_each_cpu(cpu, cpu_online_mask) { + tlat_var = per_cpu_ptr(&per_cpu_timerlat_var, cpu); + memset(tlat_var, 0, sizeof(*tlat_var)); + } } +#else /* CONFIG_TIMERLAT_TRACER */ +#define tlat_var_reset() do {} while (0) +#endif /* CONFIG_TIMERLAT_TRACER */
/* - * osn_var_reset_all - Reset the value of all per-cpu osnoise_variables + * osn_var_reset - Reset the values of the given osnoise_variables */ -static inline void osn_var_reset_all(void) +static inline void osn_var_reset(void) { struct osnoise_variables *osn_var; int cpu;
+ /* + * So far, all the values are initialized as 0, so + * zeroing the structure is perfect. + */ for_each_cpu(cpu, cpu_online_mask) { osn_var = per_cpu_ptr(&per_cpu_osnoise_var, cpu); - osn_var_reset(osn_var); + memset(osn_var, 0, sizeof(*osn_var)); } }
+/* + * osn_var_reset_all - Reset the value of all per-cpu osnoise_variables + */ +static inline void osn_var_reset_all(void) +{ + osn_var_reset(); + tlat_var_reset(); +} + /* * Tells NMIs to call back to the osnoise tracer to record timestamps. */ @@ -154,6 +205,18 @@ struct osnoise_sample { int thread_count; /* # threads during this sample */ };
+#ifdef CONFIG_TIMERLAT_TRACER +/* + * timerlat sample structure definition. Used to store the statistics of + * a sample run. + */ +struct timerlat_sample { + u64 timer_latency; /* timer_latency */ + unsigned int seqnum; /* unique sequence */ + int context; /* timer context */ +}; +#endif + /* * Protect the interface. */ @@ -165,14 +228,24 @@ struct mutex interface_lock; static struct osnoise_data { u64 sample_period; /* total sampling period */ u64 sample_runtime; /* active sampling portion of period */ - u64 stop_tracing; /* stop trace in the inside operation (loop) */ - u64 stop_tracing_total; /* stop trace in the outside operation (report) */ + u64 stop_tracing; /* stop trace in the internal operation (loop/irq) */ + u64 stop_tracing_total; /* stop trace in the final operation (report/thread) */ +#ifdef CONFIG_TIMERLAT_TRACER + u64 timerlat_period; /* timerlat period */ + u64 print_stack; /* print IRQ stack if total > */ + int timerlat_tracer; /* timerlat tracer */ +#endif bool tainted; /* inform users and developers about a problem */ } osnoise_data = { .sample_period = DEFAULT_SAMPLE_PERIOD, .sample_runtime = DEFAULT_SAMPLE_RUNTIME, .stop_tracing = 0, .stop_tracing_total = 0, +#ifdef CONFIG_TIMERLAT_TRACER + .print_stack = 0, + .timerlat_period = DEFAULT_TIMERLAT_PERIOD, + .timerlat_tracer = 0, +#endif };
/* @@ -246,6 +319,128 @@ static void trace_osnoise_sample(struct osnoise_sample *sample) trace_buffer_unlock_commit_nostack(buffer, event); }
+#ifdef CONFIG_TIMERLAT_TRACER +/* + * Print the timerlat header info. + */ +static void print_timerlat_headers(struct seq_file *s) +{ + seq_puts(s, "# _-----=> irqs-off\n"); + seq_puts(s, "# / _----=> need-resched\n"); + seq_puts(s, "# | / _---=> hardirq/softirq\n"); + seq_puts(s, "# || / _--=> preempt-depth\n"); + seq_puts(s, "# || /\n"); + seq_puts(s, "# |||| ACTIVATION\n"); + seq_puts(s, "# TASK-PID CPU# |||| TIMESTAMP ID "); + seq_puts(s, " CONTEXT LATENCY\n"); + seq_puts(s, "# | | | |||| | | "); + seq_puts(s, " | |\n"); +} + +/* + * Record a timerlat_sample into the tracer buffer. + */ +static void trace_timerlat_sample(struct timerlat_sample *sample) +{ + struct trace_array *tr = osnoise_trace; + struct trace_event_call *call = &event_osnoise; + struct trace_buffer *buffer = tr->array_buffer.buffer; + struct ring_buffer_event *event; + struct timerlat_entry *entry; + + event = trace_buffer_lock_reserve(buffer, TRACE_TIMERLAT, sizeof(*entry), + tracing_gen_ctx()); + if (!event) + return; + entry = ring_buffer_event_data(event); + entry->seqnum = sample->seqnum; + entry->context = sample->context; + entry->timer_latency = sample->timer_latency; + + if (!call_filter_check_discard(call, entry, buffer, event)) + trace_buffer_unlock_commit_nostack(buffer, event); +} + +#ifdef CONFIG_STACKTRACE + +#define MAX_CALLS 256 + +/* + * Stack trace will take place only at IRQ level, so there is no need + * to control nesting here. + */ +struct trace_stack { + int stack_size; + int nr_entries; + unsigned long calls[MAX_CALLS]; +}; + +static DEFINE_PER_CPU(struct trace_stack, trace_stack); + +/* + * timerlat_save_stack - save a stack trace without printing + * + * Save the current stack trace without printing. The + * stack will be printed later, after the end of the measurement. + */ +static void timerlat_save_stack(int skip) +{ + unsigned int size, nr_entries; + struct trace_stack *fstack; + + fstack = this_cpu_ptr(&trace_stack); + + size = ARRAY_SIZE(fstack->calls); + + nr_entries = stack_trace_save(fstack->calls, size, skip); + + fstack->stack_size = nr_entries * sizeof(unsigned long); + fstack->nr_entries = nr_entries; + + return; + +} +/* + * timerlat_dump_stack - dump a stack trace previously saved + * + * Dump a saved stack trace into the trace buffer. + */ +static void timerlat_dump_stack(void) +{ + struct trace_event_call *call = &event_osnoise; + struct trace_array *tr = osnoise_trace; + struct trace_buffer *buffer = tr->array_buffer.buffer; + struct ring_buffer_event *event; + struct trace_stack *fstack; + struct stack_entry *entry; + unsigned int size; + + preempt_disable_notrace(); + fstack = this_cpu_ptr(&trace_stack); + size = fstack->stack_size; + + event = trace_buffer_lock_reserve(buffer, TRACE_STACK, sizeof(*entry) + size, + tracing_gen_ctx()); + if (!event) + goto out; + + entry = ring_buffer_event_data(event); + + memcpy(&entry->caller, fstack->calls, size); + entry->size = fstack->nr_entries; + + if (!call_filter_check_discard(call, entry, buffer, event)) + trace_buffer_unlock_commit_nostack(buffer, event); + +out: + preempt_enable_notrace(); +} +#else +#define timerlat_dump_stack() do {} while (0) +#define timerlat_save_stack(a) do {} while (0) +#endif /* CONFIG_STACKTRACE */ +#endif /* CONFIG_TIMERLAT_TRACER */ + /* * Macros to encapsulate the time capturing infrastructure. */ @@ -387,6 +582,30 @@ set_int_safe_time(struct osnoise_variables *osn_var, u64 *time) return int_counter; }
+#ifdef CONFIG_TIMERLAT_TRACER +/* + * copy_int_safe_time - Copy *src into *desc aware of interference + */ +static u64 +copy_int_safe_time(struct osnoise_variables *osn_var, u64 *dst, u64 *src) +{ + u64 int_counter; + + do { + int_counter = local_read(&osn_var->int_counter); + /* synchronize with interrupts */ + barrier(); + + *dst = *src; + + /* synchronize with interrupts */ + barrier(); + } while (int_counter != local_read(&osn_var->int_counter)); + + return int_counter; +} +#endif /* CONFIG_TIMERLAT_TRACER */ + /* * trace_osnoise_callback - NMI entry/exit callback * @@ -597,6 +816,22 @@ void trace_softirq_exit_callback(void *data, unsigned int vec_nr) if (!osn_var->sampling) return;
+#ifdef CONFIG_TIMERLAT_TRACER + /* + * If the timerlat is enabled, but the irq handler did + * not run yet enabling timerlat_tracer, do not trace. + */ + if (unlikely(osnoise_data.timerlat_tracer)) { + struct timerlat_variables *tlat_var; + tlat_var = this_cpu_tmr_var(); + if (!tlat_var->tracing_thread) { + osn_var->softirq.arrival_time = 0; + osn_var->softirq.delta_start = 0; + return; + } + } +#endif + duration = get_int_safe_duration(osn_var, &osn_var->softirq.delta_start); trace_softirq_noise(vec_nr, osn_var->softirq.arrival_time, duration); cond_move_thread_delta_start(osn_var, duration); @@ -689,6 +924,18 @@ thread_exit(struct osnoise_variables *osn_var, struct task_struct *t) if (!osn_var->sampling) return;
+#ifdef CONFIG_TIMERLAT_TRACER + if (osnoise_data.timerlat_tracer) { + struct timerlat_variables *tlat_var; + tlat_var = this_cpu_tmr_var(); + if (!tlat_var->tracing_thread) { + osn_var->thread.delta_start = 0; + osn_var->thread.arrival_time = 0; + return; + } + } +#endif + duration = get_int_safe_duration(osn_var, &osn_var->thread.delta_start);
trace_thread_noise(t, osn_var->thread.arrival_time, duration); @@ -979,6 +1226,195 @@ static int osnoise_main(void *data) return 0; }
+#ifdef CONFIG_TIMERLAT_TRACER
+/*
+ * timerlat_irq - hrtimer handler for timerlat.
+ */
+static enum hrtimer_restart timerlat_irq(struct hrtimer *timer)
+{
+	struct osnoise_variables *osn_var = this_cpu_osn_var();
+	struct trace_array *tr = osnoise_trace;
+	struct timerlat_variables *tlat;
+	struct timerlat_sample s;
+	u64 now;
+	u64 diff;
+
+	/*
+	 * I am not sure if the timer was armed for this CPU. So, get
+	 * the timerlat struct from the timer itself, not from this
+	 * CPU.
+	 */
+	tlat = container_of(timer, struct timerlat_variables, timer);
+
+	now = ktime_to_ns(hrtimer_cb_get_time(&tlat->timer));
+
+	/*
+	 * Enable the osnoise: events for thread and softirq.
+	 */
+	tlat->tracing_thread = true;
+
+	osn_var->thread.arrival_time = time_get();
+
+	/*
+	 * A hardirq is running: the timer IRQ. It is for sure preempting
+	 * a thread, and potentially preempting a softirq.
+	 *
+	 * At this point, it is not interesting to know the duration of the
+	 * preempted thread (and maybe softirq), but how much time they will
+	 * delay the beginning of the execution of the timer thread.
+	 *
+	 * To get the correct (net) delay added by the softirq, its delta_start
+	 * is set as the IRQ one. In this way, at the return of the IRQ, the delta
+	 * start of the softirq will be zeroed, accounting then only the time
+	 * after that.
+	 *
+	 * The thread follows the same principle. However, if a softirq is
+	 * running, the thread needs to receive the softirq delta_start. The
+	 * reason is that the softirq will be the last to be unfolded,
+	 * resetting the thread delay to zero.
+	 */
+#ifndef CONFIG_PREEMPT_RT
+	if (osn_var->softirq.delta_start) {
+		copy_int_safe_time(osn_var, &osn_var->thread.delta_start,
+				   &osn_var->softirq.delta_start);
+
+		copy_int_safe_time(osn_var, &osn_var->softirq.delta_start,
+				   &osn_var->irq.delta_start);
+	} else {
+		copy_int_safe_time(osn_var, &osn_var->thread.delta_start,
+				   &osn_var->irq.delta_start);
+	}
+#else /* CONFIG_PREEMPT_RT */
+	/*
+	 * The softirqs run as threads on RT, so there is no need
+	 * to keep track of them.
+	 */
+	copy_int_safe_time(osn_var, &osn_var->thread.delta_start, &osn_var->irq.delta_start);
+#endif /* CONFIG_PREEMPT_RT */
+
+	/*
+	 * Compare the current time with the expected time.
+	 */
+	diff = now - tlat->abs_period;
+
+	tlat->count++;
+	s.seqnum = tlat->count;
+	s.timer_latency = diff;
+	s.context = IRQ_CONTEXT;
+
+	trace_timerlat_sample(&s);
+
+	/* Keep a running maximum ever recorded os noise "latency" */
+	if (diff > tr->max_latency) {
+		tr->max_latency = diff;
+		latency_fsnotify(tr);
+	}
+
+	if (osnoise_data.stop_tracing)
+		if (time_to_us(diff) >= osnoise_data.stop_tracing)
+			osnoise_stop_tracing();
+
+	wake_up_process(tlat->kthread);
+
+	if (osnoise_data.print_stack)
+		timerlat_save_stack(0);
+
+	return HRTIMER_NORESTART;
+}
+
+/*
+ * wait_next_period - Wait for the next period for timerlat
+ */
+static int wait_next_period(struct timerlat_variables *tlat)
+{
+	ktime_t next_abs_period, now;
+	u64 rel_period = osnoise_data.timerlat_period * 1000;
+
+	now = hrtimer_cb_get_time(&tlat->timer);
+	next_abs_period = ns_to_ktime(tlat->abs_period + rel_period);
+
+	/*
+	 * Save the next abs_period.
+	 */
+	tlat->abs_period = (u64) ktime_to_ns(next_abs_period);
+
+	/*
+	 * If the new abs_period is in the past, skip the activation.
+	 */
+	while (ktime_compare(now, next_abs_period) > 0) {
+		next_abs_period = ns_to_ktime(tlat->abs_period + rel_period);
+		tlat->abs_period = (u64) ktime_to_ns(next_abs_period);
+	}
+
+	set_current_state(TASK_INTERRUPTIBLE);
+
+	hrtimer_start(&tlat->timer, next_abs_period, HRTIMER_MODE_ABS_PINNED_HARD);
+	schedule();
+	return 1;
+}
+
+/*
+ * timerlat_main - Timerlat main
+ */
+static int timerlat_main(void *data)
+{
+	struct osnoise_variables *osn_var = this_cpu_osn_var();
+	struct timerlat_variables *tlat = this_cpu_tmr_var();
+	struct timerlat_sample s;
+	struct sched_param sp;
+	u64 now, diff;
+
+	/*
+	 * Make the thread RT, that is how cyclictest is usually used.
+	 */
+	sp.sched_priority = DEFAULT_TIMERLAT_PRIO;
+	sched_setscheduler_nocheck(current, SCHED_FIFO, &sp);
+
+	tlat->count = 0;
+	tlat->tracing_thread = false;
+
+	hrtimer_init(&tlat->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED_HARD);
+	tlat->timer.function = timerlat_irq;
+	tlat->kthread = current;
+	osn_var->pid = current->pid;
+	/*
+	 * Annotate the arrival time.
+	 */
+	tlat->abs_period = hrtimer_cb_get_time(&tlat->timer);
+
+	wait_next_period(tlat);
+
+	osn_var->sampling = 1;
+
+	while (!kthread_should_stop()) {
+		now = ktime_to_ns(hrtimer_cb_get_time(&tlat->timer));
+		diff = now - tlat->abs_period;
+
+		s.seqnum = tlat->count;
+		s.timer_latency = diff;
+		s.context = THREAD_CONTEXT;
+
+		trace_timerlat_sample(&s);
+
+#ifdef CONFIG_STACKTRACE
+		if (osnoise_data.print_stack)
+			if (osnoise_data.print_stack <= time_to_us(diff))
+				timerlat_dump_stack();
+#endif /* CONFIG_STACKTRACE */
+
+		tlat->tracing_thread = false;
+		if (osnoise_data.stop_tracing_total)
+			if (time_to_us(diff) >= osnoise_data.stop_tracing_total)
+				osnoise_stop_tracing();
+
+		wait_next_period(tlat);
+	}
+
+	hrtimer_cancel(&tlat->timer);
+	return 0;
+}
+#endif /* CONFIG_TIMERLAT_TRACER */
+
 /*
  * stop_per_cpu_kthread - stop per-cpu threads
  *
@@ -1009,6 +1445,7 @@ static int start_per_cpu_kthreads(struct trace_array *tr)
 	struct cpumask *current_mask = &save_cpumask;
 	struct task_struct *kthread;
 	char comm[24];
+	void *main = osnoise_main;
 	int cpu;
 	get_online_cpus();
@@ -1026,9 +1463,17 @@ static int start_per_cpu_kthreads(struct trace_array *tr)
 		per_cpu(per_cpu_osnoise_var, cpu).kthread = NULL;

 	for_each_cpu(cpu, current_mask) {
+#ifdef CONFIG_TIMERLAT_TRACER
+		if (osnoise_data.timerlat_tracer) {
+			snprintf(comm, 24, "timerlat/%d", cpu);
+			main = timerlat_main;
+		} else {
+			snprintf(comm, 24, "osnoise/%d", cpu);
+		}
+#else
 		snprintf(comm, 24, "osnoise/%d", cpu);
-
-		kthread = kthread_create_on_cpu(osnoise_main, NULL, cpu, comm);
+#endif
+		kthread = kthread_create_on_cpu(main, NULL, cpu, comm);

 		if (IS_ERR(kthread)) {
 			pr_err(BANNER "could not start sampling thread\n");
@@ -1194,6 +1639,31 @@ static struct trace_min_max_param osnoise_stop_tracing_total = {
 	.min	= NULL,
 };
+#ifdef CONFIG_TIMERLAT_TRACER
+/*
+ * osnoise/print_stack: print the stacktrace of the IRQ handler if the total
+ * latency is higher than val.
+ */
+static struct trace_min_max_param osnoise_print_stack = {
+	.lock	= &interface_lock,
+	.val	= &osnoise_data.print_stack,
+	.max	= NULL,
+	.min	= NULL,
+};
+
+/*
+ * osnoise/timerlat_period: min 100 us, max 1 s
+ */
+u64 timerlat_min_period = 100;
+u64 timerlat_max_period = 1000000;
+static struct trace_min_max_param timerlat_period = {
+	.lock	= &interface_lock,
+	.val	= &osnoise_data.timerlat_period,
+	.max	= &timerlat_max_period,
+	.min	= &timerlat_min_period,
+};
+#endif
+
 static const struct file_operations cpus_fops = {
 	.open	= tracing_open_generic,
 	.read	= osnoise_cpus_read,
@@ -1204,10 +1674,9 @@ static const struct file_operations cpus_fops = {
 /*
  * init_tracefs - A function to initialize the tracefs interface files
  *
- * This function creates entries in tracefs for "osnoise". It creates the
- * "osnoise" directory in the tracing directory, and within that
- * directory is the count, runtime and period files to change and view
- * those values.
+ * This function creates entries in tracefs for "osnoise" and "timerlat".
+ * It creates these directories in the tracing directory, and within them
+ * the user can change and view the configs.
  */
 static int init_tracefs(void)
 {
@@ -1221,7 +1690,7 @@ static int init_tracefs(void)

 	top_dir = tracefs_create_dir("osnoise", NULL);
 	if (!top_dir)
-		return -ENOMEM;
+		return 0;

 	tmp = tracefs_create_file("period_us", 0640, top_dir,
 				  &osnoise_period, &trace_min_max_fops);
@@ -1246,6 +1715,19 @@ static int init_tracefs(void)
 	tmp = trace_create_file("cpus", 0644, top_dir, NULL, &cpus_fops);
 	if (!tmp)
 		goto err;
+#ifdef CONFIG_TIMERLAT_TRACER
+#ifdef CONFIG_STACKTRACE
+	tmp = tracefs_create_file("print_stack", 0640, top_dir,
+				  &osnoise_print_stack, &trace_min_max_fops);
+	if (!tmp)
+		goto err;
+#endif
+
+	tmp = tracefs_create_file("timerlat_period_us", 0640, top_dir,
+				  &timerlat_period, &trace_min_max_fops);
+	if (!tmp)
+		goto err;
+#endif

 	return 0;

@@ -1286,18 +1768,15 @@ static int osnoise_hook_events(void)
 	return -EINVAL;
 }
-static void osnoise_tracer_start(struct trace_array *tr)
+static int __osnoise_tracer_start(struct trace_array *tr)
 {
 	int retval;

-	if (osnoise_busy)
-		return;
-
 	osn_var_reset_all();

 	retval = osnoise_hook_events();
 	if (retval)
-		goto out_err;
+		return retval;
 	/*
 	 * Make sure NMIs see reseted values.
 	 */
@@ -1305,15 +1784,27 @@ static void osnoise_tracer_start(struct trace_array *tr)
 	trace_osnoise_callback_enabled = true;

 	retval = start_per_cpu_kthreads(tr);
-	/*
-	 * all fine!
-	 */
-	if (!retval)
+	if (retval) {
+		unhook_irq_events();
+		return retval;
+	}
+
+	osnoise_busy = true;
+
+	return 0;
+}
+
+static void osnoise_tracer_start(struct trace_array *tr)
+{
+	int retval;
+
+	if (osnoise_busy)
 		return;

-out_err:
-	unhook_irq_events();
-	pr_err(BANNER "Error starting osnoise tracer\n");
+	retval = __osnoise_tracer_start(tr);
+	if (retval)
+		pr_err(BANNER "Error starting osnoise tracer\n");
+
 }

 static void osnoise_tracer_stop(struct trace_array *tr)
@@ -1335,18 +1826,16 @@ static void osnoise_tracer_stop(struct trace_array *tr)
 static int osnoise_tracer_init(struct trace_array *tr)
 {
+	/* Only allow one instance to enable this */
 	if (osnoise_busy)
 		return -EBUSY;

 	osnoise_trace = tr;
-
 	tr->max_latency = 0;

 	osnoise_tracer_start(tr);

-	osnoise_busy = true;
-
 	return 0;
 }

@@ -1365,6 +1854,71 @@ static struct tracer osnoise_tracer __read_mostly = {
 	.allow_instances	= true,
 };
+#ifdef CONFIG_TIMERLAT_TRACER
+static void timerlat_tracer_start(struct trace_array *tr)
+{
+	int retval;
+
+	if (osnoise_busy)
+		return;
+
+	osnoise_data.timerlat_tracer = 1;
+
+	retval = __osnoise_tracer_start(tr);
+	if (retval)
+		goto out_err;
+
+	return;
+out_err:
+	pr_err(BANNER "Error starting timerlat tracer\n");
+}
+
+static void timerlat_tracer_stop(struct trace_array *tr)
+{
+	int cpu;
+
+	if (!osnoise_busy)
+		return;
+
+	for_each_online_cpu(cpu)
+		per_cpu(per_cpu_osnoise_var, cpu).sampling = 0;
+
+	osnoise_tracer_stop(tr);
+
+	osnoise_data.timerlat_tracer = 0;
+}
+
+static int timerlat_tracer_init(struct trace_array *tr)
+{
+	/* Only allow one instance to enable this */
+	if (osnoise_busy)
+		return -EBUSY;
+
+	osnoise_trace = tr;
+
+	tr->max_latency = 0;
+
+	timerlat_tracer_start(tr);
+
+	return 0;
+}
+
+static void timerlat_tracer_reset(struct trace_array *tr)
+{
+	timerlat_tracer_stop(tr);
+}
+
+static struct tracer timerlat_tracer __read_mostly = {
+	.name		= "timerlat",
+	.init		= timerlat_tracer_init,
+	.reset		= timerlat_tracer_reset,
+	.start		= timerlat_tracer_start,
+	.stop		= timerlat_tracer_stop,
+	.print_header	= print_timerlat_headers,
+	.allow_instances = true,
+};
+#endif /* CONFIG_TIMERLAT_TRACER */
+
 __init static int init_osnoise_tracer(void)
 {
 	int ret;
@@ -1374,8 +1928,18 @@ __init static int init_osnoise_tracer(void)
 	cpumask_copy(&osnoise_cpumask, cpu_all_mask);

 	ret = register_tracer(&osnoise_tracer);
-	if (ret)
+	if (ret) {
+		pr_err(BANNER "Error registering osnoise!\n");
 		return ret;
+	}
+
+#ifdef CONFIG_TIMERLAT_TRACER
+	ret = register_tracer(&timerlat_tracer);
+	if (ret) {
+		pr_err(BANNER "Error registering timerlat\n");
+		return ret;
+	}
+#endif

 	init_tracefs();

diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index f1dce388121a..7042544c5bde 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -1278,6 +1278,52 @@ static struct trace_event trace_osnoise_event = {
 	.funcs	= &trace_osnoise_funcs,
 };

+/* TRACE_TIMERLAT */
+static enum print_line_t
+trace_timerlat_print(struct trace_iterator *iter, int flags,
+		     struct trace_event *event)
+{
+	struct trace_entry *entry = iter->ent;
+	struct trace_seq *s = &iter->seq;
+	struct timerlat_entry *field;
+
+	trace_assign_type(field, entry);
+
+	trace_seq_printf(s, "#%-5u context %6s timer_latency %9llu ns\n",
+			 field->seqnum,
+			 field->context ? "thread" : "irq",
+			 field->timer_latency);
+
+	return trace_handle_return(s);
+}
+
+static enum print_line_t
+trace_timerlat_raw(struct trace_iterator *iter, int flags,
+		   struct trace_event *event)
+{
+	struct timerlat_entry *field;
+	struct trace_seq *s = &iter->seq;
+
+	trace_assign_type(field, iter->ent);
+
+	trace_seq_printf(s, "%u %d %llu\n",
+			 field->seqnum,
+			 field->context,
+			 field->timer_latency);
+
+	return trace_handle_return(s);
+}
+
+static struct trace_event_functions trace_timerlat_funcs = {
+	.trace	= trace_timerlat_print,
+	.raw	= trace_timerlat_raw,
+};
+
+static struct trace_event trace_timerlat_event = {
+	.type	= TRACE_TIMERLAT,
+	.funcs	= &trace_timerlat_funcs,
+};
+
 /* TRACE_BPUTS */
 static enum print_line_t
 trace_bputs_print(struct trace_iterator *iter, int flags,
@@ -1444,6 +1490,7 @@ static struct trace_event *events[] __initdata = {
 	&trace_print_event,
 	&trace_hwlat_event,
 	&trace_osnoise_event,
+	&trace_timerlat_event,
 	&trace_raw_data_event,
 	NULL
 };
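The timerlat handler above computes diff = now - tlat->abs_period, i.e. how late the activation fired relative to its absolute deadline, which is the same quantity cyclictest measures from user space. A minimal user-space analogue of that principle (illustrative sketch only; it uses POSIX clock_nanosleep() instead of the in-kernel hrtimer API):

  #include <stdio.h>
  #include <stdint.h>
  #include <time.h>

  #define PERIOD_NS 1000000L	/* 1 ms period, i.e. timerlat_period_us=1000 */

  static uint64_t ts_to_ns(const struct timespec *ts)
  {
  	return (uint64_t)ts->tv_sec * 1000000000ULL + (uint64_t)ts->tv_nsec;
  }

  int main(void)
  {
  	struct timespec next, now;
  	int i;

  	clock_gettime(CLOCK_MONOTONIC, &next);
  	for (i = 0; i < 10; i++) {
  		/* arm the next absolute deadline, as hrtimer_start() does */
  		next.tv_nsec += PERIOD_NS;
  		while (next.tv_nsec >= 1000000000L) {
  			next.tv_nsec -= 1000000000L;
  			next.tv_sec++;
  		}
  		clock_nanosleep(CLOCK_MONOTONIC, TIMER_ABSTIME, &next, NULL);
  		clock_gettime(CLOCK_MONOTONIC, &now);
  		/* timer_latency = actual wakeup time - expected deadline */
  		printf("#%-5d context thread timer_latency %llu ns\n", i + 1,
  		       (unsigned long long)(ts_to_ns(&now) - ts_to_ns(&next)));
  	}
  	return 0;
  }

Unlike this sketch, the in-kernel version samples twice per activation: once in the hrtimer handler (IRQ_CONTEXT) and once when the woken thread runs (THREAD_CONTEXT), which separates interrupt delay from scheduling delay.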
From: Daniel Bristot de Oliveira bristot@kernel.org
mainline inclusion from mainline-v5.14-rc7 commit d03721a6e7e8c04261873b3840daa3ce2c5b0543 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4G64B CVE: NA
-------------------------------------------------
Some extra flags are printed to the trace header when using the PREEMPT_RT config. The extra flags are: need-resched-lazy, preempt-lazy-depth, and migrate-disable.
Without printing these fields, the osnoise specific fields are shifted by three positions, for example:
  # tracer: osnoise
  #
  #                              _-----=> irqs-off
  #                             / _----=> need-resched
  #                            | / _---=> hardirq/softirq
  #                            || / _--=> preempt-depth                       MAX
  #                            || /                                        SINGLE   Interference counters:
  #                            ||||            RUNTIME    NOISE  %% OF CPU  NOISE   +-----------------------------+
  #         TASK-PID     CPU#  ||||  TIMESTAMP   IN US    IN US  AVAILABLE  IN US   HW   NMI   IRQ  SIRQ  THREAD
  #            | |        |    ||||      |         |        |       |         |      |    |     |     |      |
           <...>-741    [000] ....... 1105.690909: 1000000    234  99.97660     36   21    0  1001    22      3
           <...>-742    [001] ....... 1105.691923: 1000000    281  99.97190    197    7    0  1012    35     14
           <...>-743    [002] ....... 1105.691958: 1000000   1324  99.86760    118   11    0  1016   155    143
           <...>-744    [003] ....... 1105.691998: 1000000    109  99.98910     21    4    0  1004    33      7
           <...>-745    [004] ....... 1105.692015: 1000000   2023  99.79770     97   37    0  1023    52     18
Add a new header for osnoise with the missing fields, to be used when PREEMPT_RT is enabled.
Link: https://lkml.kernel.org/r/1f03289d2a51fde5a58c2e7def063dc630820ad1.162659884...
Cc: Tom Zanussi zanussi@kernel.org Cc: Namhyung Kim namhyung@kernel.org Cc: Masami Hiramatsu mhiramat@kernel.org Signed-off-by: Daniel Bristot de Oliveira bristot@kernel.org Signed-off-by: Steven Rostedt (VMware) rostedt@goodmis.org Signed-off-by: Wang ShaoBo bobo.shaobowang@huawei.com Acked-by: Xie XiuQi xiexiuqi@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- kernel/trace/trace_osnoise.c | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+)
diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c
index 8546e66bafcb..c1f001188b0f 100644
--- a/kernel/trace/trace_osnoise.c
+++ b/kernel/trace/trace_osnoise.c
@@ -253,10 +253,40 @@ static struct osnoise_data {
  */
 static bool osnoise_busy;

+#ifdef CONFIG_PREEMPT_RT
 /*
  * Print the osnoise header info.
  */
 static void print_osnoise_headers(struct seq_file *s)
+{
+	if (osnoise_data.tainted)
+		seq_puts(s, "# osnoise is tainted!\n");
+
+	seq_puts(s, "# _-------=> irqs-off\n");
+	seq_puts(s, "# / _------=> need-resched\n");
+	seq_puts(s, "# | / _-----=> need-resched-lazy\n");
+	seq_puts(s, "# || / _----=> hardirq/softirq\n");
+	seq_puts(s, "# ||| / _---=> preempt-depth\n");
+	seq_puts(s, "# |||| / _--=> preempt-lazy-depth\n");
+	seq_puts(s, "# ||||| / _-=> migrate-disable\n");
+
+	seq_puts(s, "# |||||| / ");
+	seq_puts(s, " MAX\n");
+
+	seq_puts(s, "# ||||| / ");
+	seq_puts(s, " SINGLE Interference counters:\n");
+
+	seq_puts(s, "# ||||||| RUNTIME ");
+	seq_puts(s, " NOISE %% OF CPU NOISE +-----------------------------+\n");
+
+	seq_puts(s, "# TASK-PID CPU# ||||||| TIMESTAMP IN US ");
+	seq_puts(s, " IN US AVAILABLE IN US HW NMI IRQ SIRQ THREAD\n");
+
+	seq_puts(s, "# | | | ||||||| | | ");
+	seq_puts(s, " | | | | | | | |\n");
+}
+#else /* CONFIG_PREEMPT_RT */
+static void print_osnoise_headers(struct seq_file *s)
 {
 	if (osnoise_data.tainted)
 		seq_puts(s, "# osnoise is tainted!\n");
@@ -279,6 +309,7 @@ static void print_osnoise_headers(struct seq_file *s)
 	seq_puts(s, "# | | | |||| | | ");
 	seq_puts(s, " | | | | | | | |\n");
 }
+#endif /* CONFIG_PREEMPT_RT */

 /*
  * osnoise_taint - report an osnoise error.
From: Daniel Bristot de Oliveira bristot@kernel.org
mainline inclusion from mainline-v5.16-rc1 commit 9bd985766a43ac0115f13f67783d381ebcba70c6 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4G64B CVE: NA
-------------------------------------------------
s/CONFIG_OSNOISE_TRAECR/CONFIG_OSNOISE_TRACER/
No functional changes.
Link: https://lkml.kernel.org/r/33924a16f6e5559ce24952ca7d62561604bfd94a.163430838...
Cc: Daniel Bristot de Oliveira bristot@kernel.org Cc: Jonathan Corbet corbet@lwn.net Cc: Thomas Gleixner tglx@linutronix.de Cc: Ingo Molnar mingo@redhat.com Cc: Borislav Petkov bp@alien8.de Cc: "H. Peter Anvin" hpa@zytor.com Cc: Sebastian Andrzej Siewior bigeasy@linutronix.de Cc: x86@kernel.org Cc: linux-doc@vger.kernel.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Daniel Bristot de Oliveira bristot@kernel.org Signed-off-by: Steven Rostedt (VMware) rostedt@goodmis.org Signed-off-by: Wang ShaoBo bobo.shaobowang@huawei.com Acked-by: Xie XiuQi xiexiuqi@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- arch/x86/kernel/trace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/arch/x86/kernel/trace.c b/arch/x86/kernel/trace.c
index 6912672c33a7..ebcde0f86769 100644
--- a/arch/x86/kernel/trace.c
+++ b/arch/x86/kernel/trace.c
@@ -234,4 +234,4 @@ void osnoise_arch_unregister(void)
 	unregister_trace_local_timer_exit(trace_intel_irq_exit, "local_timer");
 	unregister_trace_local_timer_entry(trace_intel_irq_entry, NULL);
 }
-#endif /* CONFIG_OSNOISE_TRAECR && CONFIG_X86_LOCAL_APIC */
+#endif /* CONFIG_OSNOISE_TRACER && CONFIG_X86_LOCAL_APIC */
From: Daniel Bristot de Oliveira bristot@redhat.com
mainline inclusion from mainline-v5.14-rc1 commit f7d9f6370e006400655ff96cb148f56598492d91 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4G64B CVE: NA
-------------------------------------------------
kernel test robot reported some osnoise functions with "no previous prototype."
Fix these warnings by making local functions static, and by adding:
  void osnoise_trace_irq_entry(int id);
  void osnoise_trace_irq_exit(int id, const char *desc);
to include/linux/trace.h.
Link: https://lkml.kernel.org/r/e40d3cb4be8bde921f4b40fa6a095cf85ab807bd.162487260...
Fixes: bce29ac9ce0b ("trace: Add osnoise tracer") Reported-by: kernel test robot lkp@intel.com Signed-off-by: Daniel Bristot de Oliveira bristot@redhat.com Signed-off-by: Steven Rostedt (VMware) rostedt@goodmis.org Signed-off-by: Wang ShaoBo bobo.shaobowang@huawei.com Acked-by: Xie XiuQi xiexiuqi@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- arch/x86/kernel/trace.c | 3 --- include/linux/trace.h | 2 ++ kernel/trace/trace_osnoise.c | 20 +++++++++++--------- 3 files changed, 13 insertions(+), 12 deletions(-)
diff --git a/arch/x86/kernel/trace.c b/arch/x86/kernel/trace.c
index ebcde0f86769..8322e8352777 100644
--- a/arch/x86/kernel/trace.c
+++ b/arch/x86/kernel/trace.c
@@ -2,9 +2,6 @@
 #include <linux/trace.h>

 #if defined(CONFIG_OSNOISE_TRACER) && defined(CONFIG_X86_LOCAL_APIC)
-extern void osnoise_trace_irq_entry(int id);
-extern void osnoise_trace_irq_exit(int id, const char *desc);
-
 /*
  * trace_intel_irq_entry - record intel specific IRQ entry
  */
diff --git a/include/linux/trace.h b/include/linux/trace.h
index 148c6b7e0ce6..eeb81971b3ce 100644
--- a/include/linux/trace.h
+++ b/include/linux/trace.h
@@ -44,6 +44,8 @@ int trace_array_destroy(struct trace_array *tr);
 /* For osnoise tracer */
 int osnoise_arch_register(void);
 void osnoise_arch_unregister(void);
+void osnoise_trace_irq_entry(int id);
+void osnoise_trace_irq_exit(int id, const char *desc);

 #endif	/* CONFIG_TRACING */

diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c
index c1f001188b0f..5ea4c7e6f888 100644
--- a/kernel/trace/trace_osnoise.c
+++ b/kernel/trace/trace_osnoise.c
@@ -767,7 +767,7 @@ void __weak osnoise_arch_unregister(void)
  * This function hooks the IRQ related callbacks to the respective trace
  * events.
  */
-int hook_irq_events(void)
+static int hook_irq_events(void)
 {
 	int ret;

@@ -799,7 +799,7 @@ int hook_irq_events(void)
  * This function unhooks the IRQ related callbacks to the respective trace
  * events.
  */
-void unhook_irq_events(void)
+static void unhook_irq_events(void)
 {
 	osnoise_arch_unregister();
 	unregister_trace_irq_handler_exit(trace_irqexit_callback, NULL);
@@ -816,7 +816,7 @@ void unhook_irq_events(void)
  * arrival time. The delta_start is used to compute the duration at the
  * softirq exit handler. See cond_move_softirq_delta_start().
  */
-void trace_softirq_entry_callback(void *data, unsigned int vec_nr)
+static void trace_softirq_entry_callback(void *data, unsigned int vec_nr)
 {
 	struct osnoise_variables *osn_var = this_cpu_osn_var();

@@ -839,7 +839,7 @@ void trace_softirq_entry_callback(void *data, unsigned int vec_nr)
  * Computes the duration of the softirq noise, and trace it. Also discounts the
  * interference from other sources of noise could be currently being accounted.
  */
-void trace_softirq_exit_callback(void *data, unsigned int vec_nr)
+static void trace_softirq_exit_callback(void *data, unsigned int vec_nr)
 {
 	struct osnoise_variables *osn_var = this_cpu_osn_var();
 	int duration;
@@ -980,7 +980,7 @@ thread_exit(struct osnoise_variables *osn_var, struct task_struct *t)
  * This function is hooked to the sched:sched_switch trace event, and it is
  * used to record the beginning and to report the end of a thread noise window.
  */
-void
+static void
 trace_sched_switch_callback(void *data, bool preempt, struct task_struct *p,
 			    struct task_struct *n)
 {
@@ -999,7 +999,7 @@ trace_sched_switch_callback(void *data, bool preempt, struct task_struct *p,
  * Hook the osnoise tracer callbacks to handle the noise from other
  * threads on the necessary kernel events.
  */
-int hook_thread_events(void)
+static int hook_thread_events(void)
 {
 	int ret;

@@ -1016,7 +1016,7 @@ int hook_thread_events(void)
  * Unhook the osnoise tracer callbacks to handle the noise from other
  * threads on the necessary kernel events.
  */
-void unhook_thread_events(void)
+static void unhook_thread_events(void)
 {
 	unregister_trace_sched_switch(trace_sched_switch_callback, NULL);
 }
@@ -1028,7 +1028,8 @@ void unhook_thread_events(void)
  * values will be used later to compute the diff between the statistics
  * before and after the osnoise sampling.
  */
-void save_osn_sample_stats(struct osnoise_variables *osn_var, struct osnoise_sample *s)
+static void
+save_osn_sample_stats(struct osnoise_variables *osn_var, struct osnoise_sample *s)
 {
 	s->nmi_count = osn_var->nmi.count;
 	s->irq_count = osn_var->irq.count;
@@ -1043,7 +1044,8 @@ void save_osn_sample_stats(struct osnoise_variables *osn_var, struct osnoise_sample *s)
  * statistics. The struct osnoise_sample *s contains the statistics saved via
  * save_osn_sample_stats() before the osnoise sampling.
  */
-void diff_osn_sample_stats(struct osnoise_variables *osn_var, struct osnoise_sample *s)
+static void
+diff_osn_sample_stats(struct osnoise_variables *osn_var, struct osnoise_sample *s)
 {
 	s->nmi_count = osn_var->nmi.count - s->nmi_count;
 	s->irq_count = osn_var->irq.count - s->irq_count;
From: Daniel Bristot de Oliveira bristot@redhat.com
mainline inclusion from mainline-v5.14-rc1 commit c8895e271f7994a3ecb13b8a280e39aa53879545 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4G64B CVE: NA
-------------------------------------------------
Enable and disable the osnoise/timerlat workload threads during CPU hotplug online and offline operations, respectively.
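The wiring, condensed from the diff below (a sketch using the names the patch introduces, not the complete code): a dynamic CPU hotplug state whose online callback defers the work to a per-CPU work item, and whose offline callback stops the workload thread directly.

  static int osnoise_cpu_init(unsigned int cpu)
  {
  	/* online path: defer to process context on the new CPU */
  	schedule_work_on(cpu, &osnoise_hotplug_work);
  	return 0;
  }

  static int osnoise_cpu_die(unsigned int cpu)
  {
  	/* offline path: stop the workload thread of this CPU */
  	stop_kthread(cpu);
  	return 0;
  }

  static void osnoise_init_hotplug_support(void)
  {
  	int ret;

  	/* dynamic state: callbacks run on each CPU going online/offline */
  	ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "trace/osnoise:online",
  				osnoise_cpu_init, osnoise_cpu_die);
  	if (ret < 0)
  		pr_warn("could not register hotplug callbacks\n");
  }

Deferring the online work to a workqueue is needed because the hotplug callback context cannot take the mutexes that guard osnoise_busy and the cpumask.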
Link: https://lore.kernel.org/linux-doc/20210621134636.5b332226@oasis.local.home/ Link: https://lkml.kernel.org/r/39f98590b3caeb3c32f09526214058efe0e9272a.162437231...
Cc: Phil Auld pauld@redhat.com Cc: Sebastian Andrzej Siewior bigeasy@linutronix.de Cc: Kate Carcia kcarcia@redhat.com Cc: Jonathan Corbet corbet@lwn.net Cc: Ingo Molnar mingo@redhat.com Cc: Peter Zijlstra peterz@infradead.org Cc: Thomas Gleixner tglx@linutronix.de Cc: Alexandre Chartre alexandre.chartre@oracle.com Cc: Clark Williams williams@redhat.com Cc: John Kacur jkacur@redhat.com Cc: Juri Lelli juri.lelli@redhat.com Cc: Borislav Petkov bp@alien8.de Cc: "H. Peter Anvin" hpa@zytor.com Cc: x86@kernel.org Cc: linux-doc@vger.kernel.org Cc: linux-kernel@vger.kernel.org Suggested-by: Steven Rostedt (VMware) rostedt@goodmis.org Signed-off-by: Daniel Bristot de Oliveira bristot@redhat.com Signed-off-by: Steven Rostedt (VMware) rostedt@goodmis.org Signed-off-by: Wang ShaoBo bobo.shaobowang@huawei.com Acked-by: Xie XiuQi xiexiuqi@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- kernel/trace/trace_osnoise.c | 165 ++++++++++++++++++++++++++++------- 1 file changed, 135 insertions(+), 30 deletions(-)
diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c
index 5ea4c7e6f888..35e9fb9edfb5 100644
--- a/kernel/trace/trace_osnoise.c
+++ b/kernel/trace/trace_osnoise.c
@@ -1449,22 +1449,67 @@ static int timerlat_main(void *data)
 #endif /* CONFIG_TIMERLAT_TRACER */

 /*
- * stop_per_cpu_kthread - stop per-cpu threads
+ * stop_kthread - stop a workload thread
+ */
+static void stop_kthread(unsigned int cpu)
+{
+	struct task_struct *kthread;
+
+	kthread = per_cpu(per_cpu_osnoise_var, cpu).kthread;
+	if (kthread)
+		kthread_stop(kthread);
+	per_cpu(per_cpu_osnoise_var, cpu).kthread = NULL;
+}
+
+/*
+ * stop_per_cpu_kthread - Stop per-cpu threads
  *
  * Stop the osnoise sampling thread. Use this on unload and at system
  * shutdown.
  */
 static void stop_per_cpu_kthreads(void)
 {
-	struct task_struct *kthread;
 	int cpu;

-	for_each_online_cpu(cpu) {
-		kthread = per_cpu(per_cpu_osnoise_var, cpu).kthread;
-		if (kthread)
-			kthread_stop(kthread);
-		per_cpu(per_cpu_osnoise_var, cpu).kthread = NULL;
+	get_online_cpus();
+
+	for_each_online_cpu(cpu)
+		stop_kthread(cpu);
+
+	put_online_cpus();
+}
+
+/*
+ * start_kthread - Start a workload thread
+ */
+static int start_kthread(unsigned int cpu)
+{
+	struct task_struct *kthread;
+	void *main = osnoise_main;
+	char comm[24];
+
+#ifdef CONFIG_TIMERLAT_TRACER
+	if (osnoise_data.timerlat_tracer) {
+		snprintf(comm, 24, "timerlat/%d", cpu);
+		main = timerlat_main;
+	} else {
+		snprintf(comm, 24, "osnoise/%d", cpu);
 	}
+#else
+	snprintf(comm, 24, "osnoise/%d", cpu);
+#endif
+	kthread = kthread_create_on_cpu(main, NULL, cpu, comm);
+
+	if (IS_ERR(kthread)) {
+		pr_err(BANNER "could not start sampling thread\n");
+		stop_per_cpu_kthreads();
+		return -ENOMEM;
+	}
+
+	per_cpu(per_cpu_osnoise_var, cpu).kthread = kthread;
+	wake_up_process(kthread);
+
+	return 0;
 }

 /*
@@ -1476,9 +1521,7 @@ static void stop_per_cpu_kthreads(void)
 static int start_per_cpu_kthreads(struct trace_array *tr)
 {
 	struct cpumask *current_mask = &save_cpumask;
-	struct task_struct *kthread;
-	char comm[24];
-	void *main = osnoise_main;
+	int retval;
 	int cpu;

 	get_online_cpus();
@@ -1490,37 +1533,91 @@ static int start_per_cpu_kthreads(struct trace_array *tr)
 	 * And the CPU is online.
 	 */
 	cpumask_and(current_mask, cpu_online_mask, current_mask);
-	put_online_cpus();

-	for_each_online_cpu(cpu)
+	for_each_possible_cpu(cpu)
 		per_cpu(per_cpu_osnoise_var, cpu).kthread = NULL;

 	for_each_cpu(cpu, current_mask) {
-#ifdef CONFIG_TIMERLAT_TRACER
-		if (osnoise_data.timerlat_tracer) {
-			snprintf(comm, 24, "timerlat/%d", cpu);
-			main = timerlat_main;
-		} else {
-			snprintf(comm, 24, "osnoise/%d", cpu);
-		}
-#else
-		snprintf(comm, 24, "osnoise/%d", cpu);
-#endif
-		kthread = kthread_create_on_cpu(main, NULL, cpu, comm);
-
-		if (IS_ERR(kthread)) {
-			pr_err(BANNER "could not start sampling thread\n");
+		retval = start_kthread(cpu);
+		if (retval) {
 			stop_per_cpu_kthreads();
-			return -ENOMEM;
+			return retval;
 		}
-
-		per_cpu(per_cpu_osnoise_var, cpu).kthread = kthread;
-		wake_up_process(kthread);
 	}

+	put_online_cpus();
+
 	return 0;
 }

+#ifdef CONFIG_HOTPLUG_CPU
+static void osnoise_hotplug_workfn(struct work_struct *dummy)
+{
+	struct trace_array *tr = osnoise_trace;
+	unsigned int cpu = smp_processor_id();
+
+
+	mutex_lock(&trace_types_lock);
+
+	if (!osnoise_busy)
+		goto out_unlock_trace;
+
+	mutex_lock(&interface_lock);
+	get_online_cpus();
+
+	if (!cpumask_test_cpu(cpu, &osnoise_cpumask))
+		goto out_unlock;
+
+	if (!cpumask_test_cpu(cpu, tr->tracing_cpumask))
+		goto out_unlock;
+
+	start_kthread(cpu);
+
+out_unlock:
+	put_online_cpus();
+	mutex_unlock(&interface_lock);
+out_unlock_trace:
+	mutex_unlock(&trace_types_lock);
+}
+
+static DECLARE_WORK(osnoise_hotplug_work, osnoise_hotplug_workfn);
+
+/*
+ * osnoise_cpu_init - CPU hotplug online callback function
+ */
+static int osnoise_cpu_init(unsigned int cpu)
+{
+	schedule_work_on(cpu, &osnoise_hotplug_work);
+	return 0;
+}
+
+/*
+ * osnoise_cpu_die - CPU hotplug offline callback function
+ */
+static int osnoise_cpu_die(unsigned int cpu)
+{
+	stop_kthread(cpu);
+	return 0;
+}
+
+static void osnoise_init_hotplug_support(void)
+{
+	int ret;
+
+	ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "trace/osnoise:online",
+				osnoise_cpu_init, osnoise_cpu_die);
+	if (ret < 0)
+		pr_warn(BANNER "Error to init cpu hotplug support\n");
+
+	return;
+}
+#else /* CONFIG_HOTPLUG_CPU */
+static void osnoise_init_hotplug_support(void)
+{
+	return 0;
+}
+#endif /* CONFIG_HOTPLUG_CPU */
+
 /*
  * osnoise_cpus_read - Read function for reading the "cpus" file
  * @filp: The active open file structure
@@ -1616,7 +1713,14 @@ osnoise_cpus_write(struct file *filp, const char __user *ubuf, size_t count,
 		osnoise_tracer_stop(tr);

 	mutex_lock(&interface_lock);
+	/*
+	 * osnoise_cpumask is read by CPU hotplug operations.
+	 */
+	get_online_cpus();
+
 	cpumask_copy(&osnoise_cpumask, osnoise_cpumask_new);
+
+	put_online_cpus();
 	mutex_unlock(&interface_lock);

 	if (running)
@@ -1973,6 +2077,7 @@ __init static int init_osnoise_tracer(void)
 		return ret;
 	}
 #endif
+	osnoise_init_hotplug_support();

 	init_tracefs();
init_tracefs();
From: Daniel Bristot de Oliveira bristot@redhat.com
mainline inclusion from mainline-v5.14-rc1 commit 2a81afa326fd23add336cfd7e35e1d699d11d9c4 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4G64B CVE: NA
-------------------------------------------------
kernel test robot reported:
   kernel/trace/trace_osnoise.c:966:3: warning: comparison of distinct
   pointer types ('typeof ((interval)) *' (aka 'long long *') and
   'uint64_t *' (aka 'unsigned long long *')) [-Wcompare-distinct-pointer-types]
           do_div(interval, USEC_PER_MSEC);
           ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
   include/asm-generic/div64.h:228:28: note: expanded from macro 'do_div'
           (void)(((typeof((n)) *)0) == ((uint64_t *)0)); \
                   ~~~~~~~~~~~~~~~~~~ ^  ~~~~~~~~~~~~~~~
As interval cannot be negative because sample_period >= sample_runtime, making interval u64 on osnoise_main() is enough to fix this problem.
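The check that fires lives inside the do_div() macro, which insists that the dividend has a 64-bit unsigned type. A stand-alone sketch of the same class of check (the CHECK_U64 macro below is a simplified, hypothetical copy of the pointer-comparison trick in include/asm-generic/div64.h):

  #include <stdint.h>
  #include <stdio.h>

  /* simplified version of the type check inside the kernel's do_div() */
  #define CHECK_U64(n) ((void)(((typeof(n) *)0) == ((uint64_t *)0)))

  int main(void)
  {
  	uint64_t interval = 996000;	/* sample_period - sample_runtime */

  	/*
  	 * int64_t interval = 996000;
  	 * CHECK_U64(interval);   <- would emit the same
  	 * -Wcompare-distinct-pointer-types warning the robot reported,
  	 * because typeof(interval) * would be int64_t *, not uint64_t *.
  	 */
  	CHECK_U64(interval);		/* fine: typeof(interval) is uint64_t */
  	printf("%llu ms\n", (unsigned long long)(interval / 1000));
  	return 0;
  }

Declaring the variable u64 makes the two pointer types identical, so the comparison is no longer "distinct" and the warning disappears.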
Link: https://lkml.kernel.org/r/4ae1e7780563598563de079a3ef6d4d10b5f5546.162487260...
Fixes: bce29ac9ce0b ("trace: Add osnoise tracer") Reported-by: kernel test robot lkp@intel.com Signed-off-by: Daniel Bristot de Oliveira bristot@redhat.com Signed-off-by: Steven Rostedt (VMware) rostedt@goodmis.org Signed-off-by: Wang ShaoBo bobo.shaobowang@huawei.com Acked-by: Xie XiuQi xiexiuqi@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- kernel/trace/trace_osnoise.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c
index 35e9fb9edfb5..e1ed353dfdc6 100644
--- a/kernel/trace/trace_osnoise.c
+++ b/kernel/trace/trace_osnoise.c
@@ -1233,7 +1233,7 @@ static struct cpumask save_cpumask;
  */
 static int osnoise_main(void *data)
 {
-	s64 interval;
+	u64 interval;
while (!kthread_should_stop()) {
From: Daniel Bristot de Oliveira bristot@redhat.com
mainline inclusion from mainline-v5.14-rc1 commit 498627b4ac85780b9962ed9b5c5abbefd884ef8e category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4G64B CVE: NA
-------------------------------------------------
kernel test robot reported:
   kernel/trace/trace_osnoise.c:1584:2: error: void function
   'osnoise_init_hotplug_support' should not return a value [-Wreturn-type]
           return 0;
When !CONFIG_HOTPLUG_CPU.
Fix this problem by removing the return value.
Link: https://lkml.kernel.org/r/c7fc67f1a117cc88bab2e508c898634872795341.162487260...
Fixes: c8895e271f79 ("trace/osnoise: Support hotplug operations") Reported-by: kernel test robot lkp@intel.com Signed-off-by: Daniel Bristot de Oliveira bristot@redhat.com Signed-off-by: Steven Rostedt (VMware) rostedt@goodmis.org Signed-off-by: Wang ShaoBo bobo.shaobowang@huawei.com Acked-by: Xie XiuQi xiexiuqi@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- kernel/trace/trace_osnoise.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c
index e1ed353dfdc6..93be0b918eb2 100644
--- a/kernel/trace/trace_osnoise.c
+++ b/kernel/trace/trace_osnoise.c
@@ -1614,7 +1614,7 @@ static void osnoise_init_hotplug_support(void)
 #else /* CONFIG_HOTPLUG_CPU */
 static void osnoise_init_hotplug_support(void)
 {
-	return 0;
+	return;
 }
 #endif /* CONFIG_HOTPLUG_CPU */
From: Daniel Bristot de Oliveira bristot@redhat.com
mainline inclusion from mainline-v5.14-rc1 commit bd09c0556eca17f55fb09a26b6ed27bedd1b42ef category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4G64B CVE: NA
-------------------------------------------------
s/RUNTIME IN USE/RUNTIME IN US/
Link: https://lkml.kernel.org/r/43e5160422a967218aa651c47f523e8d32d6a59e.162487260...
Fixes: bce29ac9ce0b ("trace: Add osnoise tracer") Signed-off-by: Daniel Bristot de Oliveira bristot@redhat.com Signed-off-by: Steven Rostedt (VMware) rostedt@goodmis.org Signed-off-by: Wang ShaoBo bobo.shaobowang@huawei.com Acked-by: Xie XiuQi xiexiuqi@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- Documentation/trace/osnoise-tracer.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/Documentation/trace/osnoise-tracer.rst b/Documentation/trace/osnoise-tracer.rst
index 37a3c10fb216..b648cb9bf1f0 100644
--- a/Documentation/trace/osnoise-tracer.rst
+++ b/Documentation/trace/osnoise-tracer.rst
@@ -77,7 +77,7 @@ In addition to the regular trace fields (from TASK-PID to TIMESTAMP), the
 tracer prints a message at the end of each period for each CPU that is
 running an osnoise/ thread. The osnoise specific fields report:

- - The RUNTIME IN USE reports the amount of time in microseconds that
+ - The RUNTIME IN US reports the amount of time in microseconds that
    the osnoise thread kept looping reading the time.
  - The NOISE IN US reports the sum of noise in microseconds observed
    by the osnoise tracer during the associated runtime.
From: Colin Ian King colin.king@canonical.com
mainline inclusion from mainline-v5.14-rc1 commit b62613b431bdababc90bf1440b2c7427172d94f4 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4G64B CVE: NA
-------------------------------------------------
There is a spelling mistake in a TP_printk message: "interferences" is not the plural of "interference". Fix this.
Link: https://lkml.kernel.org/r/20210628125522.56361-1-colin.king@canonical.com
Reviewed-by: Daniel Bristot de Oliveira bristot@redhat.com Signed-off-by: Colin Ian King colin.king@canonical.com Signed-off-by: Steven Rostedt (VMware) rostedt@goodmis.org Signed-off-by: Wang ShaoBo bobo.shaobowang@huawei.com Acked-by: Xie XiuQi xiexiuqi@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- include/trace/events/osnoise.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/include/trace/events/osnoise.h b/include/trace/events/osnoise.h
index 28762c69f6c9..82f741ec0f57 100644
--- a/include/trace/events/osnoise.h
+++ b/include/trace/events/osnoise.h
@@ -129,7 +129,7 @@ TRACE_EVENT(sample_threshold,
 		__entry->interference = interference;
 	),

-	TP_printk("start %llu.%09u duration %llu ns interferences %llu",
+	TP_printk("start %llu.%09u duration %llu ns interference %llu",
 		  __print_ns_to_secs(__entry->start),
 		  __print_ns_without_secs(__entry->start),
 		  __entry->duration,
From: Daniel Bristot de Oliveira bristot@redhat.com
mainline inclusion from mainline-v5.14-rc1 commit 19c3eaa72288ce161441dd6b74b765a094d73488 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4G64B CVE: NA
-------------------------------------------------
Dan Carpenter reported that:
The patch bce29ac9ce0b: "trace: Add osnoise tracer" from Jun 22, 2021, leads to the following static checker warning:
kernel/trace/trace_osnoise.c:1103 run_osnoise() warn: unsigned 'noise' is never less than zero.
In this part of the code:
  1100         /*
  1101          * This shouldn't happen.
  1102          */
  1103         if (noise < 0) {
                   ^^^^^^^^^
  1104                 osnoise_taint("negative noise!");
  1105                 goto out;
  1106         }
  1107
And the static checker is right because 'noise' is u64.
Make noise s64 and keep the check. It is important to check if the time read is behaving correctly - so we can trust the results.
I also re-arranged some variable declarations.
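A stand-alone illustration of why the check was dead code and how the s64 declaration revives it (a sketch in plain C, not the kernel types):

  #include <stdio.h>

  int main(void)
  {
  	unsigned long long noise_u64 = 5;	/* old declaration: u64 */
  	long long noise_s64 = 5;		/* new declaration: s64 */

  	noise_u64 -= 10;	/* a misbehaving clock: wraps to a huge value */
  	noise_s64 -= 10;

  	if (noise_u64 < 0)	/* always false: the check the robot flagged */
  		printf("unreachable\n");

  	if (noise_s64 < 0)	/* works: the taint path can now trigger */
  		printf("negative noise: %lld\n", noise_s64);

  	return 0;
  }

With the unsigned type, a time source that steps backwards would silently produce an enormous "noise" value instead of tripping the sanity check.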
Link: https://lkml.kernel.org/r/acd7cd6e7d56b798a298c3bc8139a390b3c4ab52.162498636...
Cc: Ingo Molnar mingo@redhat.com Cc: Daniel Bristot de Oliveira bristot@redhat.com Cc: Dan Carpenter dan.carpenter@oracle.com Cc: linux-kernel@vger.kernel.org Fixes: bce29ac9ce0b ("trace: Add osnoise tracer") Reported-by: Dan Carpenter dan.carpenter@oracle.com Signed-off-by: Daniel Bristot de Oliveira bristot@redhat.com Signed-off-by: Steven Rostedt (VMware) rostedt@goodmis.org Signed-off-by: Wang ShaoBo bobo.shaobowang@huawei.com Acked-by: Xie XiuQi xiexiuqi@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- kernel/trace/trace_osnoise.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-)
diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c
index 93be0b918eb2..b4b07af1acbc 100644
--- a/kernel/trace/trace_osnoise.c
+++ b/kernel/trace/trace_osnoise.c
@@ -1073,15 +1073,16 @@ static void osnoise_stop_tracing(void)
 static int run_osnoise(void)
 {
 	struct osnoise_variables *osn_var = this_cpu_osn_var();
-	u64 noise = 0, sum_noise = 0, max_noise = 0;
 	struct trace_array *tr = osnoise_trace;
 	u64 start, sample, last_sample;
 	u64 last_int_count, int_count;
+	s64 noise = 0, max_noise = 0;
 	s64 total, last_total = 0;
 	struct osnoise_sample s;
 	unsigned int threshold;
-	int hw_count = 0;
 	u64 runtime, stop_in;
+	u64 sum_noise = 0;
+	int hw_count = 0;
 	int ret = -1;
/*
From: Daniel Bristot de Oliveira bristot@kernel.org
mainline inclusion from mainline-v5.14-rc7 commit 0e05ba498dd0a19fc12868a9506be0f86cf36912 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4G64B CVE: NA
-------------------------------------------------
When using osnoise/timerlat with stop tracing, sometimes it is not clear on which CPU the stop condition was hit, mainly when using some extra events.
Print a message informing in which CPU the trace stopped, like in the example below:
     <idle>-0     [006] d.h.  2932.676616: #1672599 context    irq timer_latency     34689 ns
     <idle>-0     [006] dNh.  2932.676618: irq_noise: local_timer:236 start 2932.676615639 duration 2391 ns
     <idle>-0     [006] dNh.  2932.676620: irq_noise: virtio0-output.0:47 start 2932.676620180 duration 86 ns
     <idle>-0     [003] d.h.  2932.676621: #1673374 context    irq timer_latency      1200 ns
     <idle>-0     [006] d...  2932.676623: thread_noise: swapper/6:0 start 2932.676615964 duration 4339 ns
     <idle>-0     [003] dNh.  2932.676623: irq_noise: local_timer:236 start 2932.676620597 duration 1881 ns
     <idle>-0     [006] d...  2932.676623: sched_switch: prev_comm=swapper/6 prev_pid=0 prev_prio=120 prev_state=R ==> next_comm=timerlat/6 next_pid=852 next_prio=4
   timerlat/6-852 [006] ....  2932.676623: #1672599 context thread timer_latency     41931 ns
     <idle>-0     [003] d...  2932.676623: thread_noise: swapper/3:0 start 2932.676620854 duration 880 ns
     <idle>-0     [003] d...  2932.676624: sched_switch: prev_comm=swapper/3 prev_pid=0 prev_prio=120 prev_state=R ==> next_comm=timerlat/3 next_pid=849 next_prio=4
   timerlat/6-852 [006] ....  2932.676624: timerlat_main: stop tracing hit on cpu 6
   timerlat/3-849 [003] ....  2932.676624: #1673374 context thread timer_latency      4310 ns
Link: https://lkml.kernel.org/r/b30a0d7542adba019185f44ee648e60e14923b11.162659884...
Cc: Tom Zanussi zanussi@kernel.org Cc: Namhyung Kim namhyung@kernel.org Cc: Masami Hiramatsu mhiramat@kernel.org Signed-off-by: Daniel Bristot de Oliveira bristot@kernel.org Signed-off-by: Steven Rostedt (VMware) rostedt@goodmis.org Signed-off-by: Wang ShaoBo bobo.shaobowang@huawei.com Acked-by: Xie XiuQi xiexiuqi@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- kernel/trace/trace_osnoise.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-)
diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c
index b4b07af1acbc..ff25bb827796 100644
--- a/kernel/trace/trace_osnoise.c
+++ b/kernel/trace/trace_osnoise.c
@@ -1056,9 +1056,13 @@ diff_osn_sample_stats(struct osnoise_variables *osn_var, struct osnoise_sample *s)
 /*
  * osnoise_stop_tracing - Stop tracing and the tracer.
  */
-static void osnoise_stop_tracing(void)
+static __always_inline void osnoise_stop_tracing(void)
 {
 	struct trace_array *tr = osnoise_trace;
+
+	trace_array_printk_buf(tr->array_buffer.buffer, _THIS_IP_,
+			       "stop tracing hit on cpu %d\n", smp_processor_id());
+
 	tracer_tracing_off(tr);
 }
From: "Qiang.Zhang" qiang.zhang@windriver.com
mainline inclusion from mainline-v5.15-rc1 commit 4b6b08f2e45edda4c067ac40833e3c1f84383c0b category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4G64B CVE: NA
-------------------------------------------------
When start_kthread() returns an error, cpus_read_unlock() needs to be called.
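Condensed, the corrected start_per_cpu_kthreads() flow looks like this (a sketch of the post-patch code, not the full function; note that this backport uses get/put_online_cpus(), the older names for the cpus_read_lock()/cpus_read_unlock() pair mentioned above):

  	int retval = 0;

  	get_online_cpus();
  	/* ... build current_mask from the online and osnoise cpumasks ... */
  	for_each_cpu(cpu, current_mask) {
  		retval = start_kthread(cpu);
  		if (retval) {
  			stop_per_cpu_kthreads();
  			break;	/* fall through to the unlock, do not return */
  		}
  	}

  	put_online_cpus();

  	return retval;

Replacing the early "return retval" with "break" is what guarantees the unlock runs on the error path.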
Link: https://lkml.kernel.org/r/20210831022919.27630-1-qiang.zhang@windriver.com
Cc: stable@vger.kernel.org Fixes: c8895e271f79 ("trace/osnoise: Support hotplug operations") Acked-by: Daniel Bristot de Oliveira bristot@kernel.org Signed-off-by: Qiang.Zhang qiang.zhang@windriver.com Signed-off-by: Steven Rostedt (VMware) rostedt@goodmis.org Signed-off-by: Wang ShaoBo bobo.shaobowang@huawei.com Acked-by: Xie XiuQi xiexiuqi@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- kernel/trace/trace_osnoise.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c
index ff25bb827796..66b2ed45be3c 100644
--- a/kernel/trace/trace_osnoise.c
+++ b/kernel/trace/trace_osnoise.c
@@ -1526,7 +1526,7 @@ static int start_kthread(unsigned int cpu)
 static int start_per_cpu_kthreads(struct trace_array *tr)
 {
 	struct cpumask *current_mask = &save_cpumask;
-	int retval;
+	int retval = 0;
 	int cpu;

 	get_online_cpus();
@@ -1546,13 +1546,13 @@ static int start_per_cpu_kthreads(struct trace_array *tr)
 		retval = start_kthread(cpu);
 		if (retval) {
 			stop_per_cpu_kthreads();
-			return retval;
+			break;
 		}
 	}

 	put_online_cpus();

-	return 0;
+	return retval;
 }
#ifdef CONFIG_HOTPLUG_CPU
From: Jackie Liu liuyun01@kylinos.cn
mainline inclusion from mainline-v5.15-rc6 commit 424b650f35c77defbb3cbd6e5221d3697af42250 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4G64B CVE: NA
-------------------------------------------------
The compiler warns when the data are actually unused:
  kernel/trace/trace.c:1712:13: error: ‘trace_create_maxlat_file’ defined but not used [-Werror=unused-function]
   1712 | static void trace_create_maxlat_file(struct trace_array *tr,
        |             ^~~~~~~~~~~~~~~~~~~~~~~~

[Why] With CONFIG_HWLAT_TRACER=n, CONFIG_TRACER_MAX_TRACE=n, and CONFIG_OSNOISE_TRACER=y, gcc reports this warning.

[How] Currently, trace_create_maxlat_file() only takes effect when CONFIG_HWLAT_TRACER=y or CONFIG_TRACER_MAX_TRACE=y. After the addition of the osnoise tracer, it also needs to take effect for CONFIG_OSNOISE_TRACER=y.
Link: https://lore.kernel.org/all/c1d9e328-ad7c-920b-6c24-9e1598a6421c@infradead.o... Link: https://lkml.kernel.org/r/20210922025122.3268022-1-liu.yun@linux.dev
Fixes: bce29ac9ce0b ("trace: Add osnoise tracer") Cc: Daniel Bristot de Oliveira bristot@redhat.com Suggested-by: Steven Rostedt rostedt@goodmis.org Reviewed-by: Daniel Bristot de Oliveira bristot@kernel.org Tested-by: Randy Dunlap rdunlap@infradead.org # build-tested Signed-off-by: Jackie Liu liuyun01@kylinos.cn Signed-off-by: Steven Rostedt (VMware) rostedt@goodmis.org Signed-off-by: Wang ShaoBo bobo.shaobowang@huawei.com Acked-by: Xie XiuQi xiexiuqi@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- kernel/trace/trace.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-)
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 2573a42e270a..3499324efe20 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -1733,16 +1733,15 @@ void latency_fsnotify(struct trace_array *tr)
 	irq_work_queue(&tr->fsnotify_irqwork);
 }

-/*
- * (defined(CONFIG_TRACER_MAX_TRACE) || defined(CONFIG_HWLAT_TRACER)) && \
- *  defined(CONFIG_FSNOTIFY)
- */
-#else
+#elif defined(CONFIG_TRACER_MAX_TRACE) || defined(CONFIG_HWLAT_TRACER) \
+	|| defined(CONFIG_OSNOISE_TRACER)

 #define trace_create_maxlat_file(tr, d_tracer)				\
 	trace_create_file("tracing_max_latency", 0644, d_tracer,	\
 			  &tr->max_latency, &tracing_max_lat_fops)

+#else
+#define trace_create_maxlat_file(tr, d_tracer)	do { } while (0)
 #endif

 #ifdef CONFIG_TRACER_MAX_TRACE
@@ -9093,9 +9092,7 @@ init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer)

 	create_trace_options_dir(tr);

-#if defined(CONFIG_TRACER_MAX_TRACE) || defined(CONFIG_HWLAT_TRACER)
 	trace_create_maxlat_file(tr, d_tracer);
-#endif

 	if (ftrace_create_function_files(tr, d_tracer))
 		MEM_FAIL(1, "Could not allocate function filter files");
if (ftrace_create_function_files(tr, d_tracer)) MEM_FAIL(1, "Could not allocate function filter files");
From: Daniel Bristot de Oliveira bristot@kernel.org
mainline inclusion from mainline-v5.16-rc1 commit 66df27f19f7dacae471f7214df5bab93d6f88b5f category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4G64B CVE: NA
-------------------------------------------------
In preparation to support multiple instances, decouple the osnoise/timerlat workload from the instance-specific tracing_cpumask.
Different instances can have conflicting cpumasks, making osnoise workload management needlessly complex. Osnoise already has its global cpumask.
I also thought about using the first instance mask, but the "first" instance could be removed before the others.
This also fixes the problem that changing the tracing_cpumask was not re-starting the trace.
Link: https://lkml.kernel.org/r/169a71bcc919ce3ab53ae6f9ca5cde57fffaf9c6.163570289...
Cc: Ingo Molnar mingo@redhat.com Cc: Tom Zanussi zanussi@kernel.org Cc: Masami Hiramatsu mhiramat@kernel.org Cc: Juri Lelli juri.lelli@redhat.com Cc: Clark Williams williams@redhat.com Cc: John Kacur jkacur@redhat.com Cc: Peter Zijlstra peterz@infradead.org Cc: Thomas Gleixner tglx@linutronix.de Cc: Sebastian Andrzej Siewior bigeasy@linutronix.de Cc: Daniel Bristot de Oliveira bristot@kernel.org Cc: linux-rt-users@vger.kernel.org Cc: linux-trace-devel@vger.kernel.org Signed-off-by: Daniel Bristot de Oliveira bristot@kernel.org Signed-off-by: Steven Rostedt (VMware) rostedt@goodmis.org Signed-off-by: Wang ShaoBo bobo.shaobowang@huawei.com Acked-by: Xie XiuQi xiexiuqi@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- kernel/trace/trace_osnoise.c | 24 ++++++------------------ 1 file changed, 6 insertions(+), 18 deletions(-)
diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c
index 66b2ed45be3c..b3f1c09870ca 100644
--- a/kernel/trace/trace_osnoise.c
+++ b/kernel/trace/trace_osnoise.c
@@ -1531,13 +1531,9 @@ static int start_per_cpu_kthreads(struct trace_array *tr)

 	get_online_cpus();
 	/*
-	 * Run only on CPUs in which trace and osnoise are allowed to run.
+	 * Run only on online CPUs in which osnoise is allowed to run.
 	 */
-	cpumask_and(current_mask, tr->tracing_cpumask, &osnoise_cpumask);
-	/*
-	 * And the CPU is online.
-	 */
-	cpumask_and(current_mask, cpu_online_mask, current_mask);
+	cpumask_and(current_mask, cpu_online_mask, &osnoise_cpumask);

 	for_each_possible_cpu(cpu)
 		per_cpu(per_cpu_osnoise_var, cpu).kthread = NULL;
@@ -1558,10 +1554,8 @@ static int start_per_cpu_kthreads(struct trace_array *tr)
 #ifdef CONFIG_HOTPLUG_CPU
 static void osnoise_hotplug_workfn(struct work_struct *dummy)
 {
-	struct trace_array *tr = osnoise_trace;
 	unsigned int cpu = smp_processor_id();

-
 	mutex_lock(&trace_types_lock);

 	if (!osnoise_busy)
@@ -1573,9 +1567,6 @@ static void osnoise_hotplug_workfn(struct work_struct *dummy)
 	if (!cpumask_test_cpu(cpu, &osnoise_cpumask))
 		goto out_unlock;

-	if (!cpumask_test_cpu(cpu, tr->tracing_cpumask))
-		goto out_unlock;
-
 	start_kthread(cpu);

 out_unlock:
@@ -1678,13 +1669,10 @@ static void osnoise_tracer_stop(struct trace_array *tr);
  * interface to the osnoise trace. By default, it lists all CPUs,
  * in this way, allowing osnoise threads to run on any online CPU
  * of the system. It serves to restrict the execution of osnoise to the
- * set of CPUs writing via this interface. Note that osnoise also
- * respects the "tracing_cpumask." Hence, osnoise threads will run only
- * on the set of CPUs allowed here AND on "tracing_cpumask." Why not
- * have just "tracing_cpumask?" Because the user might be interested
- * in tracing what is running on other CPUs. For instance, one might
- * run osnoise in one HT CPU while observing what is running on the
- * sibling HT CPU.
+ * set of CPUs writing via this interface. Why not use "tracing_cpumask"?
+ * Because the user might be interested in tracing what is running on
+ * other CPUs. For instance, one might run osnoise in one HT CPU
+ * while observing what is running on the sibling HT CPU.
  */
 static ssize_t
 osnoise_cpus_write(struct file *filp, const char __user *ubuf, size_t count,
From: Daniel Bristot de Oliveira bristot@kernel.org
mainline inclusion from mainline-v5.16-rc1 commit c3b6343c0dc4a76f838e25391f6f1cdb25cfbb8c category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4G64B CVE: NA
-------------------------------------------------
trace_osnoise_callback_enabled is used by ftrace_nmi_enter/exit() to know when to call the NMI callback. The barrier is used to avoid having the callbacks enabled before the resetting of the data during the start, and to avoid touching the values after the tracer is stopped.
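A stand-alone sketch of the publication pattern being protected (barrier() here is the usual compiler barrier; a compiler barrier is sufficient in this case because the racing reader, the NMI callback, runs on the same CPU as the writer):

  #include <stdio.h>

  #define barrier() __asm__ __volatile__("" ::: "memory")

  static int counters;		/* stand-in for the per-CPU osnoise counters */
  static int callback_enabled;	/* stand-in for trace_osnoise_callback_enabled */

  static void tracer_start(void)
  {
  	counters = 0;		/* reset the data ...                    */
  	barrier();		/* ... keep the stores in this order ... */
  	callback_enabled = 1;	/* ... then let the NMI callback run     */
  }

  static void tracer_stop(void)
  {
  	callback_enabled = 0;	/* stop new callbacks ...                */
  	barrier();		/* ... before the data is torn down      */
  	counters = -1;
  }

  int main(void)
  {
  	tracer_start();
  	if (callback_enabled)
  		printf("counters = %d\n", counters);
  	tracer_stop();
  	return 0;
  }

Without the barrier, the compiler is free to reorder the two plain stores, and an NMI arriving between them could observe the flag set while the counters still hold stale values.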
Link: https://lkml.kernel.org/r/a413b8f14aa9312fbd1ba99f96225a8aed831053.163570289...
Cc: Ingo Molnar mingo@redhat.com Cc: Tom Zanussi zanussi@kernel.org Cc: Masami Hiramatsu mhiramat@kernel.org Cc: Juri Lelli juri.lelli@redhat.com Cc: Clark Williams williams@redhat.com Cc: John Kacur jkacur@redhat.com Cc: Peter Zijlstra peterz@infradead.org Cc: Thomas Gleixner tglx@linutronix.de Cc: Sebastian Andrzej Siewior bigeasy@linutronix.de Cc: Daniel Bristot de Oliveira bristot@kernel.org Cc: linux-rt-users@vger.kernel.org Cc: linux-trace-devel@vger.kernel.org Cc: linux-kernel@vger.kernel.org Suggested-by: Steven Rostedt rostedt@goodmis.org Signed-off-by: Daniel Bristot de Oliveira bristot@kernel.org Signed-off-by: Steven Rostedt (VMware) rostedt@goodmis.org Signed-off-by: Wang ShaoBo bobo.shaobowang@huawei.com Acked-by: Xie XiuQi xiexiuqi@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- kernel/trace/trace_osnoise.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-)
diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c
index b3f1c09870ca..2bfc4f10f7d6 100644
--- a/kernel/trace/trace_osnoise.c
+++ b/kernel/trace/trace_osnoise.c
@@ -1907,8 +1907,10 @@ static int __osnoise_tracer_start(struct trace_array *tr)
 	retval = osnoise_hook_events();
 	if (retval)
 		return retval;
+
 	/*
-	 * Make sure NMIs see reseted values.
+	 * Make sure that ftrace_nmi_enter/exit() see reset values
+	 * before enabling trace_osnoise_callback_enabled.
 	 */
 	barrier();
 	trace_osnoise_callback_enabled = true;
@@ -1943,6 +1945,10 @@ static void osnoise_tracer_stop(struct trace_array *tr)
 		return;

 	trace_osnoise_callback_enabled = false;
+	/*
+	 * Make sure that ftrace_nmi_enter/exit() see
+	 * trace_osnoise_callback_enabled as false before continuing.
+	 */
 	barrier();

 	stop_per_cpu_kthreads();
stop_per_cpu_kthreads();
From: Daniel Bristot de Oliveira bristot@kernel.org
mainline inclusion from mainline-v5.16-rc1 commit 15ca4bdb0327b35e09682a0f7975e21688f54306 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4G64B CVE: NA
-------------------------------------------------
In preparation for supporting multiple trace instances, create workload start/stop specific functions.
No functional change.
Link: https://lkml.kernel.org/r/74b090971e9acdd13625be1c28ef3270d2275e77.163570289...
Cc: Ingo Molnar mingo@redhat.com Cc: Tom Zanussi zanussi@kernel.org Cc: Masami Hiramatsu mhiramat@kernel.org Cc: Juri Lelli juri.lelli@redhat.com Cc: Clark Williams williams@redhat.com Cc: John Kacur jkacur@redhat.com Cc: Peter Zijlstra peterz@infradead.org Cc: Thomas Gleixner tglx@linutronix.de Cc: Sebastian Andrzej Siewior bigeasy@linutronix.de Cc: Daniel Bristot de Oliveira bristot@kernel.org Cc: linux-rt-users@vger.kernel.org Cc: linux-trace-devel@vger.kernel.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Daniel Bristot de Oliveira bristot@kernel.org Signed-off-by: Steven Rostedt (VMware) rostedt@goodmis.org Signed-off-by: Wang ShaoBo bobo.shaobowang@huawei.com Acked-by: Xie XiuQi xiexiuqi@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- kernel/trace/trace_osnoise.c | 59 ++++++++++++++++++++++-------------- 1 file changed, 36 insertions(+), 23 deletions(-)
diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c
index 2bfc4f10f7d6..225888c9164b 100644
--- a/kernel/trace/trace_osnoise.c
+++ b/kernel/trace/trace_osnoise.c
@@ -1523,7 +1523,7 @@ static int start_kthread(unsigned int cpu)
  * This starts the kernel thread that will look for osnoise on many
  * cpus.
  */
-static int start_per_cpu_kthreads(struct trace_array *tr)
+static int start_per_cpu_kthreads(void)
 {
 	struct cpumask *current_mask = &save_cpumask;
 	int retval = 0;
@@ -1655,8 +1655,8 @@ osnoise_cpus_read(struct file *filp, char __user *ubuf, size_t count,
 	return count;
 }

-static void osnoise_tracer_start(struct trace_array *tr);
-static void osnoise_tracer_stop(struct trace_array *tr);
+static int osnoise_workload_start(void);
+static void osnoise_workload_stop(void);

 /*
  * osnoise_cpus_write - Write function for "cpus" entry
@@ -1678,7 +1678,6 @@ static ssize_t
 osnoise_cpus_write(struct file *filp, const char __user *ubuf, size_t count,
 		   loff_t *ppos)
 {
-	struct trace_array *tr = osnoise_trace;
 	cpumask_var_t osnoise_cpumask_new;
 	int running, err;
 	char buf[256];
@@ -1703,7 +1702,7 @@ osnoise_cpus_write(struct file *filp, const char __user *ubuf, size_t count,
 	mutex_lock(&trace_types_lock);
 	running = osnoise_busy;
 	if (running)
-		osnoise_tracer_stop(tr);
+		osnoise_workload_stop();

 	mutex_lock(&interface_lock);
 	/*
@@ -1717,7 +1716,7 @@ osnoise_cpus_write(struct file *filp, const char __user *ubuf, size_t count,
 	mutex_unlock(&interface_lock);

 	if (running)
-		osnoise_tracer_start(tr);
+		osnoise_workload_start();
 	mutex_unlock(&trace_types_lock);

 	free_cpumask_var(osnoise_cpumask_new);
@@ -1898,7 +1897,10 @@ static int osnoise_hook_events(void)
 	return -EINVAL;
 }

-static int __osnoise_tracer_start(struct trace_array *tr)
+/*
+ * osnoise_workload_start - start the workload and hook to events
+ */
+static int osnoise_workload_start(void)
 {
 	int retval;

@@ -1915,7 +1917,7 @@ static int __osnoise_tracer_start(struct trace_array *tr)
 	barrier();
 	trace_osnoise_callback_enabled = true;

-	retval = start_per_cpu_kthreads(tr);
+	retval = start_per_cpu_kthreads();
 	if (retval) {
 		unhook_irq_events();
 		return retval;
@@ -1926,20 +1928,10 @@ static int __osnoise_tracer_start(struct trace_array *tr)
 	return 0;
 }

-static void osnoise_tracer_start(struct trace_array *tr)
-{
-	int retval;
-
-	if (osnoise_busy)
-		return;
-
-	retval = __osnoise_tracer_start(tr);
-	if (retval)
-		pr_err(BANNER "Error starting osnoise tracer\n");
-
-}
-
-static void osnoise_tracer_stop(struct trace_array *tr)
+/*
+ * osnoise_workload_stop - stop the workload and unhook the events
+ */
+static void osnoise_workload_stop(void)
 {
 	if (!osnoise_busy)
 		return;
@@ -1960,6 +1952,27 @@ static void osnoise_tracer_stop(struct trace_array *tr)
 	osnoise_busy = false;
 }

+static void osnoise_tracer_start(struct trace_array *tr)
+{
+	int retval;
+
+	if (osnoise_busy)
+		return;
+
+	retval = osnoise_workload_start();
+	if (retval)
+		pr_err(BANNER "Error starting osnoise tracer\n");
+
+}
+
+static void osnoise_tracer_stop(struct trace_array *tr)
+{
+	if (!osnoise_busy)
+		return;
+
+	osnoise_workload_stop();
+}
+
 static int osnoise_tracer_init(struct trace_array *tr)
 {

@@ -2000,7 +2013,7 @@ static void timerlat_tracer_start(struct trace_array *tr)

 	osnoise_data.timerlat_tracer = 1;

-	retval = __osnoise_tracer_start(tr);
+	retval = osnoise_workload_start();
 	if (retval)
 		goto out_err;
From: Daniel Bristot de Oliveira bristot@kernel.org
mainline inclusion from mainline-v5.16-rc1 commit 2bd1bdf01fb25906f18cd8ebfac81c2217d1478a category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4G64B CVE: NA
-------------------------------------------------
When writing a new CPU mask via osnoise/cpus, if the tracer is running, the workload is restarted to follow the new cpumask. The restart is currently done using osnoise_workload_start/stop(), which disables the workload *and* the instrumentation. However, disabling the instrumentation is not necessary.
Calling start/stop_per_cpu_kthreads() is enough to apply the new osnoise/cpus config.
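In effect, the cpus write handler now only cycles the per-CPU workload threads around the mask update, leaving the instrumentation hooked. A condensed sketch of the resulting flow (simplified from the diff below; error handling elided):

	mutex_lock(&trace_types_lock);
	running = osnoise_busy;
	if (running)
		stop_per_cpu_kthreads();	/* stop the workload threads only */

	mutex_lock(&interface_lock);
	cpumask_copy(&osnoise_cpumask, osnoise_cpumask_new);
	mutex_unlock(&interface_lock);

	if (running)
		start_per_cpu_kthreads();	/* events stayed hooked throughout */
	mutex_unlock(&trace_types_lock);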
Link: https://lkml.kernel.org/r/ee633e82867c5b88851aa6040522a799c0034486.163570289...
Cc: Ingo Molnar mingo@redhat.com Cc: Tom Zanussi zanussi@kernel.org Cc: Masami Hiramatsu mhiramat@kernel.org Cc: Juri Lelli juri.lelli@redhat.com Cc: Clark Williams williams@redhat.com Cc: John Kacur jkacur@redhat.com Cc: Peter Zijlstra peterz@infradead.org Cc: Thomas Gleixner tglx@linutronix.de Cc: Sebastian Andrzej Siewior bigeasy@linutronix.de Cc: Daniel Bristot de Oliveira bristot@kernel.org Cc: linux-rt-users@vger.kernel.org Cc: linux-trace-devel@vger.kernel.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Daniel Bristot de Oliveira bristot@kernel.org Signed-off-by: Steven Rostedt (VMware) rostedt@goodmis.org Signed-off-by: Wang ShaoBo bobo.shaobowang@huawei.com Acked-by: Xie XiuQi xiexiuqi@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- kernel/trace/trace_osnoise.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-)
diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c index 225888c9164b..d6cedd2e039b 100644 --- a/kernel/trace/trace_osnoise.c +++ b/kernel/trace/trace_osnoise.c @@ -1655,9 +1655,6 @@ osnoise_cpus_read(struct file *filp, char __user *ubuf, size_t count, return count; }
-static int osnoise_workload_start(void); -static void osnoise_workload_stop(void); - /* * osnoise_cpus_write - Write function for "cpus" entry * @filp: The active open file structure @@ -1702,7 +1699,7 @@ osnoise_cpus_write(struct file *filp, const char __user *ubuf, size_t count, mutex_lock(&trace_types_lock); running = osnoise_busy; if (running) - osnoise_workload_stop(); + stop_per_cpu_kthreads();
mutex_lock(&interface_lock); /* @@ -1716,7 +1713,7 @@ osnoise_cpus_write(struct file *filp, const char __user *ubuf, size_t count, mutex_unlock(&interface_lock);
if (running) - osnoise_workload_start(); + start_per_cpu_kthreads(); mutex_unlock(&trace_types_lock);
free_cpumask_var(osnoise_cpumask_new);
From: Daniel Bristot de Oliveira bristot@kernel.org
mainline inclusion from mainline-v5.16-rc1 commit dae181349f1e9d279f171afc708d2824ab35a86f category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4G64B CVE: NA
-------------------------------------------------
The osnoise/timerlat tracers were built to run as a single instance, so a single variable was enough to store the current struct trace_array *tr describing the tracing instance: the *osnoise_trace variable. A trace_array represents a trace instance.
In preparation to support multiple instances, replace the *osnoise_trace variable with an RCU protected list of instances.
The operations that refer to an instance now propagate to all elements of the list (all instances).
Also, replace the osnoise_busy variable with a check of whether the list has elements (busy).
No functional change is expected with this patch, i.e., only one instance is still allowed.
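For readers less familiar with the idiom, the propagation to all instances follows the standard RCU list pattern; a minimal sketch, mirroring the trace_osnoise_sample() hunk in the diff below:

	rcu_read_lock();
	list_for_each_entry_rcu(inst, &osnoise_instances, list) {
		buffer = inst->tr->array_buffer.buffer;
		__trace_osnoise_sample(sample, buffer);	/* record in this instance */
	}
	rcu_read_unlock();

Readers only take rcu_read_lock(); register/unregister are serialized by trace_types_lock and use list_add_tail_rcu()/list_del_rcu(), with synchronize_rcu() before kfree() so in-flight readers never see freed memory.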
Link: https://lkml.kernel.org/r/91d006e889b9a5d1ff258fe6077f021ae3f26372.163570289...
Cc: Ingo Molnar mingo@redhat.com Cc: Tom Zanussi zanussi@kernel.org Cc: Masami Hiramatsu mhiramat@kernel.org Cc: Juri Lelli juri.lelli@redhat.com Cc: Clark Williams williams@redhat.com Cc: John Kacur jkacur@redhat.com Cc: Peter Zijlstra peterz@infradead.org Cc: Thomas Gleixner tglx@linutronix.de Cc: Sebastian Andrzej Siewior bigeasy@linutronix.de Cc: Daniel Bristot de Oliveira bristot@kernel.org Cc: linux-rt-users@vger.kernel.org Cc: linux-trace-devel@vger.kernel.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Daniel Bristot de Oliveira bristot@kernel.org Signed-off-by: Steven Rostedt (VMware) rostedt@goodmis.org Signed-off-by: Wang ShaoBo bobo.shaobowang@huawei.com Acked-by: Xie XiuQi xiexiuqi@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- kernel/trace/trace_osnoise.c | 261 ++++++++++++++++++++++++++--------- 1 file changed, 192 insertions(+), 69 deletions(-)
diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c index d6cedd2e039b..9980047834b5 100644 --- a/kernel/trace/trace_osnoise.c +++ b/kernel/trace/trace_osnoise.c @@ -38,8 +38,6 @@ #define CREATE_TRACE_POINTS #include <trace/events/osnoise.h>
-static struct trace_array *osnoise_trace; - /* * Default values. */ @@ -50,6 +48,81 @@ static struct trace_array *osnoise_trace; #define DEFAULT_TIMERLAT_PERIOD 1000 /* 1ms */ #define DEFAULT_TIMERLAT_PRIO 95 /* FIFO 95 */
+/* + * trace_array of the enabled osnoise/timerlat instances. + */ +struct osnoise_instance { + struct list_head list; + struct trace_array *tr; +}; +struct list_head osnoise_instances; + +static bool osnoise_has_registered_instances(void) +{ + return !!list_first_or_null_rcu(&osnoise_instances, + struct osnoise_instance, + list); +} + +/* + * osnoise_register_instance - register a new trace instance + * + * Register a trace_array *tr in the list of instances running + * osnoise/timerlat tracers. + */ +static int osnoise_register_instance(struct trace_array *tr) +{ + struct osnoise_instance *inst; + + /* + * register/unregister serialization is provided by trace's + * trace_types_lock. + */ + lockdep_assert_held(&trace_types_lock); + + inst = kmalloc(sizeof(*inst), GFP_KERNEL); + if (!inst) + return -ENOMEM; + + INIT_LIST_HEAD_RCU(&inst->list); + inst->tr = tr; + list_add_tail_rcu(&inst->list, &osnoise_instances); + + return 0; +} + +/* + * osnoise_unregister_instance - unregister a registered trace instance + * + * Remove the trace_array *tr from the list of instances running + * osnoise/timerlat tracers. + */ +static void osnoise_unregister_instance(struct trace_array *tr) +{ + struct osnoise_instance *inst; + int found = 0; + + /* + * register/unregister serialization is provided by trace's + * trace_types_lock. + */ + lockdep_assert_held(&trace_types_lock); + + list_for_each_entry_rcu(inst, &osnoise_instances, list) { + if (inst->tr == tr) { + list_del_rcu(&inst->list); + found = 1; + break; + } + } + + if (!found) + return; + + synchronize_rcu(); + kfree(inst); +} + /* * NMI runtime info. */ @@ -248,11 +321,6 @@ static struct osnoise_data { #endif };
-/* - * Boolean variable used to inform that the tracer is currently sampling. - */ -static bool osnoise_busy; - #ifdef CONFIG_PREEMPT_RT /* * Print the osnoise header info. @@ -315,19 +383,24 @@ static void print_osnoise_headers(struct seq_file *s) * osnoise_taint - report an osnoise error. */ #define osnoise_taint(msg) ({ \ - struct trace_array *tr = osnoise_trace; \ + struct osnoise_instance *inst; \ + struct trace_buffer *buffer; \ \ - trace_array_printk_buf(tr->array_buffer.buffer, _THIS_IP_, msg); \ + rcu_read_lock(); \ + list_for_each_entry_rcu(inst, &osnoise_instances, list) { \ + buffer = inst->tr->array_buffer.buffer; \ + trace_array_printk_buf(buffer, _THIS_IP_, msg); \ + } \ + rcu_read_unlock(); \ osnoise_data.tainted = true; \ })
/* * Record an osnoise_sample into the tracer buffer. */ -static void trace_osnoise_sample(struct osnoise_sample *sample) +static void +__trace_osnoise_sample(struct osnoise_sample *sample, struct trace_buffer *buffer) { - struct trace_array *tr = osnoise_trace; - struct trace_buffer *buffer = tr->array_buffer.buffer; struct trace_event_call *call = &event_osnoise; struct ring_buffer_event *event; struct osnoise_entry *entry; @@ -350,6 +423,22 @@ static void trace_osnoise_sample(struct osnoise_sample *sample) trace_buffer_unlock_commit_nostack(buffer, event); }
+/* + * Record an osnoise_sample on all osnoise instances. + */ +static void trace_osnoise_sample(struct osnoise_sample *sample) +{ + struct osnoise_instance *inst; + struct trace_buffer *buffer; + + rcu_read_lock(); + list_for_each_entry_rcu(inst, &osnoise_instances, list) { + buffer = inst->tr->array_buffer.buffer; + __trace_osnoise_sample(sample, buffer); + } + rcu_read_unlock(); +} + #ifdef CONFIG_TIMERLAT_TRACER /* * Print the timerlat header info. @@ -368,14 +457,10 @@ static void print_timerlat_headers(struct seq_file *s) seq_puts(s, " | |\n"); }
-/* - * Record an timerlat_sample into the tracer buffer. - */ -static void trace_timerlat_sample(struct timerlat_sample *sample) +static void +__trace_timerlat_sample(struct timerlat_sample *sample, struct trace_buffer *buffer) { - struct trace_array *tr = osnoise_trace; struct trace_event_call *call = &event_osnoise; - struct trace_buffer *buffer = tr->array_buffer.buffer; struct ring_buffer_event *event; struct timerlat_entry *entry;
@@ -392,6 +477,22 @@ static void trace_timerlat_sample(struct timerlat_sample *sample) trace_buffer_unlock_commit_nostack(buffer, event); }
+/* + * Record an timerlat_sample into the tracer buffer. + */ +static void trace_timerlat_sample(struct timerlat_sample *sample) +{ + struct osnoise_instance *inst; + struct trace_buffer *buffer; + + rcu_read_lock(); + list_for_each_entry_rcu(inst, &osnoise_instances, list) { + buffer = inst->tr->array_buffer.buffer; + __trace_timerlat_sample(sample, buffer); + } + rcu_read_unlock(); +} + #ifdef CONFIG_STACKTRACE
#define MAX_CALLS 256 @@ -431,29 +532,18 @@ static void timerlat_save_stack(int skip) return;
} -/* - * timerlat_dump_stack - dump a stack trace previously saved - * - * Dump a saved stack trace into the trace buffer. - */ -static void timerlat_dump_stack(void) + +static void +__timerlat_dump_stack(struct trace_buffer *buffer, struct trace_stack *fstack, unsigned int size) { struct trace_event_call *call = &event_osnoise; - struct trace_array *tr = osnoise_trace; - struct trace_buffer *buffer = tr->array_buffer.buffer; struct ring_buffer_event *event; - struct trace_stack *fstack; struct stack_entry *entry; - unsigned int size; - - preempt_disable_notrace(); - fstack = this_cpu_ptr(&trace_stack); - size = fstack->stack_size;
event = trace_buffer_lock_reserve(buffer, TRACE_STACK, sizeof(*entry) + size, tracing_gen_ctx()); if (!event) - goto out; + return;
entry = ring_buffer_event_data(event);
@@ -462,8 +552,29 @@ static void timerlat_dump_stack(void)
if (!call_filter_check_discard(call, entry, buffer, event)) trace_buffer_unlock_commit_nostack(buffer, event); +}
-out: +/* + * timerlat_dump_stack - dump a stack trace previously saved + */ +static void timerlat_dump_stack(void) +{ + struct osnoise_instance *inst; + struct trace_buffer *buffer; + struct trace_stack *fstack; + unsigned int size; + + preempt_disable_notrace(); + fstack = this_cpu_ptr(&trace_stack); + size = fstack->stack_size; + + rcu_read_lock(); + list_for_each_entry_rcu(inst, &osnoise_instances, list) { + buffer = inst->tr->array_buffer.buffer; + __timerlat_dump_stack(buffer, fstack, size); + + } + rcu_read_unlock(); preempt_enable_notrace(); } #else @@ -1058,12 +1169,37 @@ diff_osn_sample_stats(struct osnoise_variables *osn_var, struct osnoise_sample * */ static __always_inline void osnoise_stop_tracing(void) { - struct trace_array *tr = osnoise_trace; + struct osnoise_instance *inst; + struct trace_array *tr; + + rcu_read_lock(); + list_for_each_entry_rcu(inst, &osnoise_instances, list) { + tr = inst->tr; + trace_array_printk_buf(tr->array_buffer.buffer, _THIS_IP_, + "stop tracing hit on cpu %d\n", smp_processor_id()); + + tracer_tracing_off(tr); + } + rcu_read_unlock(); +}
- trace_array_printk_buf(tr->array_buffer.buffer, _THIS_IP_, - "stop tracing hit on cpu %d\n", smp_processor_id()); +/* + * notify_new_max_latency - Notify a new max latency via fsnotify interface. + */ +static void notify_new_max_latency(u64 latency) +{ + struct osnoise_instance *inst; + struct trace_array *tr;
- tracer_tracing_off(tr); + rcu_read_lock(); + list_for_each_entry_rcu(inst, &osnoise_instances, list) { + tr = inst->tr; + if (tr->max_latency < latency) { + tr->max_latency = latency; + latency_fsnotify(tr); + } + } + rcu_read_unlock(); }
/* @@ -1077,7 +1213,6 @@ static __always_inline void osnoise_stop_tracing(void) static int run_osnoise(void) { struct osnoise_variables *osn_var = this_cpu_osn_var(); - struct trace_array *tr = osnoise_trace; u64 start, sample, last_sample; u64 last_int_count, int_count; s64 noise = 0, max_noise = 0; @@ -1212,11 +1347,7 @@ static int run_osnoise(void)
trace_osnoise_sample(&s);
- /* Keep a running maximum ever recorded osnoise "latency" */ - if (max_noise > tr->max_latency) { - tr->max_latency = max_noise; - latency_fsnotify(tr); - } + notify_new_max_latency(max_noise);
if (osnoise_data.stop_tracing_total) if (s.noise > osnoise_data.stop_tracing_total) @@ -1271,7 +1402,6 @@ static int osnoise_main(void *data) static enum hrtimer_restart timerlat_irq(struct hrtimer *timer) { struct osnoise_variables *osn_var = this_cpu_osn_var(); - struct trace_array *tr = osnoise_trace; struct timerlat_variables *tlat; struct timerlat_sample s; u64 now; @@ -1342,11 +1472,7 @@ static enum hrtimer_restart timerlat_irq(struct hrtimer *timer)
trace_timerlat_sample(&s);
- /* Keep a running maximum ever recorded os noise "latency" */ - if (diff > tr->max_latency) { - tr->max_latency = diff; - latency_fsnotify(tr); - } + notify_new_max_latency(diff);
if (osnoise_data.stop_tracing) if (time_to_us(diff) >= osnoise_data.stop_tracing) @@ -1558,7 +1684,7 @@ static void osnoise_hotplug_workfn(struct work_struct *dummy)
mutex_lock(&trace_types_lock);
- if (!osnoise_busy) + if (!osnoise_has_registered_instances()) goto out_unlock_trace;
mutex_lock(&interface_lock); @@ -1693,11 +1819,10 @@ osnoise_cpus_write(struct file *filp, const char __user *ubuf, size_t count, goto err_free;
/* - * trace_types_lock is taken to avoid concurrency on start/stop - * and osnoise_busy. + * trace_types_lock is taken to avoid concurrency on start/stop. */ mutex_lock(&trace_types_lock); - running = osnoise_busy; + running = osnoise_has_registered_instances(); if (running) stop_per_cpu_kthreads();
@@ -1920,8 +2045,6 @@ static int osnoise_workload_start(void) return retval; }
- osnoise_busy = true; - return 0; }
@@ -1930,7 +2053,7 @@ static int osnoise_workload_start(void) */ static void osnoise_workload_stop(void) { - if (!osnoise_busy) + if (osnoise_has_registered_instances()) return;
trace_osnoise_callback_enabled = false; @@ -1945,28 +2068,28 @@ static void osnoise_workload_stop(void) unhook_irq_events(); unhook_softirq_events(); unhook_thread_events(); - - osnoise_busy = false; }
static void osnoise_tracer_start(struct trace_array *tr) { int retval;
- if (osnoise_busy) + if (osnoise_has_registered_instances()) return;
retval = osnoise_workload_start(); if (retval) pr_err(BANNER "Error starting osnoise tracer\n");
+ osnoise_register_instance(tr); }
static void osnoise_tracer_stop(struct trace_array *tr) { - if (!osnoise_busy) + if (!osnoise_has_registered_instances()) return;
+ osnoise_unregister_instance(tr); osnoise_workload_stop(); }
@@ -1974,14 +2097,12 @@ static int osnoise_tracer_init(struct trace_array *tr) {
/* Only allow one instance to enable this */ - if (osnoise_busy) + if (osnoise_has_registered_instances()) return -EBUSY;
- osnoise_trace = tr; tr->max_latency = 0;
osnoise_tracer_start(tr); - return 0; }
@@ -2005,7 +2126,7 @@ static void timerlat_tracer_start(struct trace_array *tr) { int retval;
- if (osnoise_busy) + if (osnoise_has_registered_instances()) return;
osnoise_data.timerlat_tracer = 1; @@ -2014,6 +2135,8 @@ static void timerlat_tracer_start(struct trace_array *tr) if (retval) goto out_err;
+ osnoise_register_instance(tr); + return; out_err: pr_err(BANNER "Error starting timerlat tracer\n"); @@ -2023,7 +2146,7 @@ static void timerlat_tracer_stop(struct trace_array *tr) { int cpu;
- if (!osnoise_busy) + if (!osnoise_has_registered_instances()) return;
for_each_online_cpu(cpu) @@ -2037,11 +2160,9 @@ static void timerlat_tracer_stop(struct trace_array *tr) static int timerlat_tracer_init(struct trace_array *tr) { /* Only allow one instance to enable this */ - if (osnoise_busy) + if (osnoise_has_registered_instances()) return -EBUSY;
- osnoise_trace = tr; - tr->max_latency = 0;
timerlat_tracer_start(tr); @@ -2088,6 +2209,8 @@ __init static int init_osnoise_tracer(void) #endif osnoise_init_hotplug_support();
+ INIT_LIST_HEAD_RCU(&osnoise_instances); + init_tracefs();
return 0;
From: "Steven Rostedt (VMware)" rostedt@goodmis.org
mainline inclusion from mainline-v5.16-rc1 commit 21ccc9cd72116289469e5519b6159c675a2fa58f category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4G64B CVE: NA
-------------------------------------------------
When building the files in the tracefs file system, do not by default set any permissions for OTH (other). This makes it easier for admins who want to define a group for accessing tracefs, without first having to clear all the "other" permission bits throughout the file system.
As tracing can leak sensitive information, it should never allow all users access by default. An admin can still set the permission bits for others to have access, which may be useful for creating a honeypot and seeing who takes advantage of it to root the machine.
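For reference, the two mode macros this series introduces in trace.h (see the trace.h hunk below) encode the new defaults:

	#define TRACE_MODE_WRITE	0640	/* rw-r----- : owner rw, group r, other none */
	#define TRACE_MODE_READ		0440	/* r--r----- : owner and group r, other none */

With these, something like an illustrative 'chgrp -R tracing /sys/kernel/tracing' is enough to hand a dedicated group read (or write) access, with no per-file chmod pass to strip the "other" bits.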
Link: https://lkml.kernel.org/r/20210818153038.864149276@goodmis.org
Signed-off-by: Steven Rostedt (VMware) rostedt@goodmis.org Signed-off-by: Wang ShaoBo bobo.shaobowang@huawei.com Acked-by: Xie XiuQi xiexiuqi@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- kernel/trace/ftrace.c | 23 ++++----- kernel/trace/trace.c | 73 ++++++++++++++-------------- kernel/trace/trace.h | 3 ++ kernel/trace/trace_dynevent.c | 2 +- kernel/trace/trace_events.c | 37 +++++++------- kernel/trace/trace_events_synth.c | 4 +- kernel/trace/trace_functions_graph.c | 2 +- kernel/trace/trace_hwlat.c | 4 +- kernel/trace/trace_kprobe.c | 8 +-- kernel/trace/trace_osnoise.c | 10 ++-- kernel/trace/trace_printk.c | 2 +- kernel/trace/trace_stack.c | 6 +-- kernel/trace/trace_stat.c | 6 +-- kernel/trace/trace_uprobe.c | 4 +- 14 files changed, 95 insertions(+), 89 deletions(-)
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 4a5d35dc490b..a1c517ed769b 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -988,8 +988,9 @@ static __init void ftrace_profile_tracefs(struct dentry *d_tracer) } }
- entry = tracefs_create_file("function_profile_enabled", 0644, - d_tracer, NULL, &ftrace_profile_fops); + entry = tracefs_create_file("function_profile_enabled", + TRACE_MODE_WRITE, d_tracer, NULL, + &ftrace_profile_fops); if (!entry) pr_warn("Could not create tracefs 'function_profile_enabled' entry\n"); } @@ -6117,10 +6118,10 @@ void ftrace_create_filter_files(struct ftrace_ops *ops, struct dentry *parent) {
- trace_create_file("set_ftrace_filter", 0644, parent, + trace_create_file("set_ftrace_filter", TRACE_MODE_WRITE, parent, ops, &ftrace_filter_fops);
- trace_create_file("set_ftrace_notrace", 0644, parent, + trace_create_file("set_ftrace_notrace", TRACE_MODE_WRITE, parent, ops, &ftrace_notrace_fops); }
@@ -6147,19 +6148,19 @@ void ftrace_destroy_filter_files(struct ftrace_ops *ops) static __init int ftrace_init_dyn_tracefs(struct dentry *d_tracer) {
- trace_create_file("available_filter_functions", 0444, + trace_create_file("available_filter_functions", TRACE_MODE_READ, d_tracer, NULL, &ftrace_avail_fops);
- trace_create_file("enabled_functions", 0444, + trace_create_file("enabled_functions", TRACE_MODE_READ, d_tracer, NULL, &ftrace_enabled_fops);
ftrace_create_filter_files(&global_ops, d_tracer);
#ifdef CONFIG_FUNCTION_GRAPH_TRACER - trace_create_file("set_graph_function", 0644, d_tracer, + trace_create_file("set_graph_function", TRACE_MODE_WRITE, d_tracer, NULL, &ftrace_graph_fops); - trace_create_file("set_graph_notrace", 0644, d_tracer, + trace_create_file("set_graph_notrace", TRACE_MODE_WRITE, d_tracer, NULL, &ftrace_graph_notrace_fops); #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ @@ -7502,10 +7503,10 @@ static const struct file_operations ftrace_no_pid_fops = {
void ftrace_init_tracefs(struct trace_array *tr, struct dentry *d_tracer) { - trace_create_file("set_ftrace_pid", 0644, d_tracer, + trace_create_file("set_ftrace_pid", TRACE_MODE_WRITE, d_tracer, tr, &ftrace_pid_fops); - trace_create_file("set_ftrace_notrace_pid", 0644, d_tracer, - tr, &ftrace_no_pid_fops); + trace_create_file("set_ftrace_notrace_pid", TRACE_MODE_WRITE, + d_tracer, tr, &ftrace_no_pid_fops); }
void __init ftrace_init_tracefs_toplevel(struct trace_array *tr, diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 3499324efe20..e2df88b691e8 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1703,7 +1703,8 @@ static void trace_create_maxlat_file(struct trace_array *tr, { INIT_WORK(&tr->fsnotify_work, latency_fsnotify_workfn); init_irq_work(&tr->fsnotify_irqwork, latency_fsnotify_workfn_irq); - tr->d_max_latency = trace_create_file("tracing_max_latency", 0644, + tr->d_max_latency = trace_create_file("tracing_max_latency", + TRACE_MODE_WRITE, d_tracer, &tr->max_latency, &tracing_max_lat_fops); } @@ -1737,8 +1738,8 @@ void latency_fsnotify(struct trace_array *tr) || defined(CONFIG_OSNOISE_TRACER)
#define trace_create_maxlat_file(tr, d_tracer) \ - trace_create_file("tracing_max_latency", 0644, d_tracer, \ - &tr->max_latency, &tracing_max_lat_fops) + trace_create_file("tracing_max_latency", TRACE_MODE_WRITE, \ + d_tracer, &tr->max_latency, &tracing_max_lat_fops)
#else #define trace_create_maxlat_file(tr, d_tracer) do { } while (0) @@ -5701,7 +5702,7 @@ trace_insert_eval_map_file(struct module *mod, struct trace_eval_map **start,
static void trace_create_eval_file(struct dentry *d_tracer) { - trace_create_file("eval_map", 0444, d_tracer, + trace_create_file("eval_map", TRACE_MODE_READ, d_tracer, NULL, &tracing_eval_map_fops); }
@@ -8211,27 +8212,27 @@ tracing_init_tracefs_percpu(struct trace_array *tr, long cpu) }
/* per cpu trace_pipe */ - trace_create_cpu_file("trace_pipe", 0444, d_cpu, + trace_create_cpu_file("trace_pipe", TRACE_MODE_READ, d_cpu, tr, cpu, &tracing_pipe_fops);
/* per cpu trace */ - trace_create_cpu_file("trace", 0644, d_cpu, + trace_create_cpu_file("trace", TRACE_MODE_WRITE, d_cpu, tr, cpu, &tracing_fops);
- trace_create_cpu_file("trace_pipe_raw", 0444, d_cpu, + trace_create_cpu_file("trace_pipe_raw", TRACE_MODE_READ, d_cpu, tr, cpu, &tracing_buffers_fops);
- trace_create_cpu_file("stats", 0444, d_cpu, + trace_create_cpu_file("stats", TRACE_MODE_READ, d_cpu, tr, cpu, &tracing_stats_fops);
- trace_create_cpu_file("buffer_size_kb", 0444, d_cpu, + trace_create_cpu_file("buffer_size_kb", TRACE_MODE_READ, d_cpu, tr, cpu, &tracing_entries_fops);
#ifdef CONFIG_TRACER_SNAPSHOT - trace_create_cpu_file("snapshot", 0644, d_cpu, + trace_create_cpu_file("snapshot", TRACE_MODE_WRITE, d_cpu, tr, cpu, &snapshot_fops);
- trace_create_cpu_file("snapshot_raw", 0444, d_cpu, + trace_create_cpu_file("snapshot_raw", TRACE_MODE_READ, d_cpu, tr, cpu, &snapshot_raw_fops); #endif } @@ -8437,8 +8438,8 @@ create_trace_option_file(struct trace_array *tr, topt->opt = opt; topt->tr = tr;
- topt->entry = trace_create_file(opt->name, 0644, t_options, topt, - &trace_options_fops); + topt->entry = trace_create_file(opt->name, TRACE_MODE_WRITE, + t_options, topt, &trace_options_fops);
}
@@ -8513,7 +8514,7 @@ create_trace_option_core_file(struct trace_array *tr, if (!t_options) return NULL;
- return trace_create_file(option, 0644, t_options, + return trace_create_file(option, TRACE_MODE_WRITE, t_options, (void *)&tr->trace_flags_index[index], &trace_options_core_fops); } @@ -9037,28 +9038,28 @@ init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer) struct trace_event_file *file; int cpu;
- trace_create_file("available_tracers", 0444, d_tracer, + trace_create_file("available_tracers", TRACE_MODE_READ, d_tracer, tr, &show_traces_fops);
- trace_create_file("current_tracer", 0644, d_tracer, + trace_create_file("current_tracer", TRACE_MODE_WRITE, d_tracer, tr, &set_tracer_fops);
- trace_create_file("tracing_cpumask", 0644, d_tracer, + trace_create_file("tracing_cpumask", TRACE_MODE_WRITE, d_tracer, tr, &tracing_cpumask_fops);
- trace_create_file("trace_options", 0644, d_tracer, + trace_create_file("trace_options", TRACE_MODE_WRITE, d_tracer, tr, &tracing_iter_fops);
- trace_create_file("trace", 0644, d_tracer, + trace_create_file("trace", TRACE_MODE_WRITE, d_tracer, tr, &tracing_fops);
- trace_create_file("trace_pipe", 0444, d_tracer, + trace_create_file("trace_pipe", TRACE_MODE_READ, d_tracer, tr, &tracing_pipe_fops);
- trace_create_file("buffer_size_kb", 0644, d_tracer, + trace_create_file("buffer_size_kb", TRACE_MODE_WRITE, d_tracer, tr, &tracing_entries_fops);
- trace_create_file("buffer_total_size_kb", 0444, d_tracer, + trace_create_file("buffer_total_size_kb", TRACE_MODE_READ, d_tracer, tr, &tracing_total_entries_fops);
trace_create_file("free_buffer", 0200, d_tracer, @@ -9069,25 +9070,25 @@ init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer)
file = __find_event_file(tr, "ftrace", "print"); if (file && file->dir) - trace_create_file("trigger", 0644, file->dir, file, - &event_trigger_fops); + trace_create_file("trigger", TRACE_MODE_WRITE, file->dir, + file, &event_trigger_fops); tr->trace_marker_file = file;
trace_create_file("trace_marker_raw", 0220, d_tracer, tr, &tracing_mark_raw_fops);
- trace_create_file("trace_clock", 0644, d_tracer, tr, + trace_create_file("trace_clock", TRACE_MODE_WRITE, d_tracer, tr, &trace_clock_fops);
- trace_create_file("tracing_on", 0644, d_tracer, + trace_create_file("tracing_on", TRACE_MODE_WRITE, d_tracer, tr, &rb_simple_fops);
- trace_create_file("timestamp_mode", 0444, d_tracer, tr, + trace_create_file("timestamp_mode", TRACE_MODE_READ, d_tracer, tr, &trace_time_stamp_mode_fops);
tr->buffer_percent = 50;
- trace_create_file("buffer_percent", 0444, d_tracer, + trace_create_file("buffer_percent", TRACE_MODE_READ, d_tracer, tr, &buffer_percent_fops);
create_trace_options_dir(tr); @@ -9098,11 +9099,11 @@ init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer) MEM_FAIL(1, "Could not allocate function filter files");
#ifdef CONFIG_TRACER_SNAPSHOT - trace_create_file("snapshot", 0644, d_tracer, + trace_create_file("snapshot", TRACE_MODE_WRITE, d_tracer, tr, &snapshot_fops); #endif
- trace_create_file("error_log", 0644, d_tracer, + trace_create_file("error_log", TRACE_MODE_WRITE, d_tracer, tr, &tracing_err_log_fops);
for_each_tracing_cpu(cpu) @@ -9265,19 +9266,19 @@ static __init int tracer_init_tracefs(void) init_tracer_tracefs(&global_trace, NULL); ftrace_init_tracefs_toplevel(&global_trace, NULL);
- trace_create_file("tracing_thresh", 0644, NULL, + trace_create_file("tracing_thresh", TRACE_MODE_WRITE, NULL, &global_trace, &tracing_thresh_fops);
- trace_create_file("README", 0444, NULL, + trace_create_file("README", TRACE_MODE_READ, NULL, NULL, &tracing_readme_fops);
- trace_create_file("saved_cmdlines", 0444, NULL, + trace_create_file("saved_cmdlines", TRACE_MODE_READ, NULL, NULL, &tracing_saved_cmdlines_fops);
- trace_create_file("saved_cmdlines_size", 0644, NULL, + trace_create_file("saved_cmdlines_size", TRACE_MODE_WRITE, NULL, NULL, &tracing_saved_cmdlines_size_fops);
- trace_create_file("saved_tgids", 0444, NULL, + trace_create_file("saved_tgids", TRACE_MODE_READ, NULL, NULL, &tracing_saved_tgids_fops);
trace_eval_init(); @@ -9289,7 +9290,7 @@ static __init int tracer_init_tracefs(void) #endif
#ifdef CONFIG_DYNAMIC_FTRACE - trace_create_file("dyn_ftrace_total_info", 0444, NULL, + trace_create_file("dyn_ftrace_total_info", TRACE_MODE_READ, NULL, NULL, &tracing_dyn_info_fops); #endif
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 46c8a6ac9e3c..aaae8c24c82e 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -26,6 +26,9 @@ #include <asm/syscall.h> /* some archs define it here */ #endif
+#define TRACE_MODE_WRITE 0640 +#define TRACE_MODE_READ 0440 + enum trace_type { __TRACE_FIRST_TYPE = 0,
diff --git a/kernel/trace/trace_dynevent.c b/kernel/trace/trace_dynevent.c index 5fa49cfd2bb6..eb776cfb9a8f 100644 --- a/kernel/trace/trace_dynevent.c +++ b/kernel/trace/trace_dynevent.c @@ -213,7 +213,7 @@ static __init int init_dynamic_event(void) if (ret) return 0;
- entry = tracefs_create_file("dynamic_events", 0644, NULL, + entry = tracefs_create_file("dynamic_events", TRACE_MODE_WRITE, NULL, NULL, &dynamic_events_ops);
/* Event list interface */ diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 546a535f1490..5af763a73948 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -2187,12 +2187,12 @@ event_create_dir(struct dentry *parent, struct trace_event_file *file) }
if (call->class->reg && !(call->flags & TRACE_EVENT_FL_IGNORE_ENABLE)) - trace_create_file("enable", 0644, file->dir, file, + trace_create_file("enable", TRACE_MODE_WRITE, file->dir, file, &ftrace_enable_fops);
#ifdef CONFIG_PERF_EVENTS if (call->event.type && call->class->reg) - trace_create_file("id", 0444, file->dir, + trace_create_file("id", TRACE_MODE_READ, file->dir, (void *)(long)call->event.type, &ftrace_event_id_fops); #endif @@ -2208,22 +2208,22 @@ event_create_dir(struct dentry *parent, struct trace_event_file *file) * triggers or filters. */ if (!(call->flags & TRACE_EVENT_FL_IGNORE_ENABLE)) { - trace_create_file("filter", 0644, file->dir, file, - &ftrace_event_filter_fops); + trace_create_file("filter", TRACE_MODE_WRITE, file->dir, + file, &ftrace_event_filter_fops);
- trace_create_file("trigger", 0644, file->dir, file, - &event_trigger_fops); + trace_create_file("trigger", TRACE_MODE_WRITE, file->dir, + file, &event_trigger_fops); }
#ifdef CONFIG_HIST_TRIGGERS - trace_create_file("hist", 0444, file->dir, file, + trace_create_file("hist", TRACE_MODE_READ, file->dir, file, &event_hist_fops); #endif #ifdef CONFIG_HIST_TRIGGERS_DEBUG - trace_create_file("hist_debug", 0444, file->dir, file, + trace_create_file("hist_debug", TRACE_MODE_READ, file->dir, file, &event_hist_debug_fops); #endif - trace_create_file("format", 0444, file->dir, call, + trace_create_file("format", TRACE_MODE_READ, file->dir, call, &ftrace_event_format_fops);
#ifdef CONFIG_TRACE_EVENT_INJECT @@ -3212,7 +3212,7 @@ create_event_toplevel_files(struct dentry *parent, struct trace_array *tr) struct dentry *d_events; struct dentry *entry;
- entry = tracefs_create_file("set_event", 0644, parent, + entry = tracefs_create_file("set_event", TRACE_MODE_WRITE, parent, tr, &ftrace_set_event_fops); if (!entry) { pr_warn("Could not create tracefs 'set_event' entry\n"); @@ -3225,7 +3225,7 @@ create_event_toplevel_files(struct dentry *parent, struct trace_array *tr) return -ENOMEM; }
- entry = trace_create_file("enable", 0644, d_events, + entry = trace_create_file("enable", TRACE_MODE_WRITE, d_events, tr, &ftrace_tr_enable_fops); if (!entry) { pr_warn("Could not create tracefs 'enable' entry\n"); @@ -3234,24 +3234,25 @@ create_event_toplevel_files(struct dentry *parent, struct trace_array *tr)
/* There are not as crucial, just warn if they are not created */
- entry = tracefs_create_file("set_event_pid", 0644, parent, + entry = tracefs_create_file("set_event_pid", TRACE_MODE_WRITE, parent, tr, &ftrace_set_event_pid_fops); if (!entry) pr_warn("Could not create tracefs 'set_event_pid' entry\n");
- entry = tracefs_create_file("set_event_notrace_pid", 0644, parent, - tr, &ftrace_set_event_notrace_pid_fops); + entry = tracefs_create_file("set_event_notrace_pid", + TRACE_MODE_WRITE, parent, tr, + &ftrace_set_event_notrace_pid_fops); if (!entry) pr_warn("Could not create tracefs 'set_event_notrace_pid' entry\n");
/* ring buffer internal formats */ - entry = trace_create_file("header_page", 0444, d_events, + entry = trace_create_file("header_page", TRACE_MODE_READ, d_events, ring_buffer_print_page_header, &ftrace_show_header_fops); if (!entry) pr_warn("Could not create tracefs 'header_page' entry\n");
- entry = trace_create_file("header_event", 0444, d_events, + entry = trace_create_file("header_event", TRACE_MODE_READ, d_events, ring_buffer_print_entry_header, &ftrace_show_header_fops); if (!entry) @@ -3468,8 +3469,8 @@ __init int event_trace_init(void) if (!tr) return -ENODEV;
- entry = tracefs_create_file("available_events", 0444, NULL, - tr, &ftrace_avail_fops); + entry = tracefs_create_file("available_events", TRACE_MODE_READ, + NULL, tr, &ftrace_avail_fops); if (!entry) pr_warn("Could not create tracefs 'available_events' entry\n");
diff --git a/kernel/trace/trace_events_synth.c b/kernel/trace/trace_events_synth.c index 881df991742a..88109dc2dfa5 100644 --- a/kernel/trace/trace_events_synth.c +++ b/kernel/trace/trace_events_synth.c @@ -2086,8 +2086,8 @@ static __init int trace_events_synth_init(void) if (err) goto err;
- entry = tracefs_create_file("synthetic_events", 0644, NULL, - NULL, &synth_events_fops); + entry = tracefs_create_file("synthetic_events", TRACE_MODE_WRITE, + NULL, NULL, &synth_events_fops); if (!entry) { err = -ENODEV; goto err; diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index b086ba8bb3d6..677538c3b9ec 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -1340,7 +1340,7 @@ static __init int init_graph_tracefs(void) if (ret) return 0;
- trace_create_file("max_graph_depth", 0644, NULL, + trace_create_file("max_graph_depth", TRACE_MODE_WRITE, NULL, NULL, &graph_depth_fops);
return 0; diff --git a/kernel/trace/trace_hwlat.c b/kernel/trace/trace_hwlat.c index 4c01c5d8b9a7..fd5726c24d7f 100644 --- a/kernel/trace/trace_hwlat.c +++ b/kernel/trace/trace_hwlat.c @@ -544,14 +544,14 @@ static int init_tracefs(void) if (!top_dir) return -ENOMEM;
- hwlat_sample_window = tracefs_create_file("window", 0640, + hwlat_sample_window = tracefs_create_file("window", TRACE_MODE_WRITE, top_dir, &hwlat_data.sample_window, &window_fops); if (!hwlat_sample_window) goto err;
- hwlat_sample_width = tracefs_create_file("width", 0644, + hwlat_sample_width = tracefs_create_file("width", TRACE_MODE_WRITE, top_dir, &hwlat_data.sample_width, &width_fops); diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 11242ff53663..d2780151f78e 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -1928,16 +1928,16 @@ static __init int init_kprobe_trace(void) if (ret) return 0;
- entry = tracefs_create_file("kprobe_events", 0644, NULL, - NULL, &kprobe_events_ops); + entry = tracefs_create_file("kprobe_events", TRACE_MODE_WRITE, + NULL, NULL, &kprobe_events_ops);
/* Event list interface */ if (!entry) pr_warn("Could not create tracefs 'kprobe_events' entry\n");
/* Profile interface */ - entry = tracefs_create_file("kprobe_profile", 0444, NULL, - NULL, &kprobe_profile_ops); + entry = tracefs_create_file("kprobe_profile", TRACE_MODE_READ, + NULL, NULL, &kprobe_profile_ops);
if (!entry) pr_warn("Could not create tracefs 'kprobe_profile' entry\n"); diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c index 9980047834b5..3cdbe9ae6fe7 100644 --- a/kernel/trace/trace_osnoise.c +++ b/kernel/trace/trace_osnoise.c @@ -1943,27 +1943,27 @@ static int init_tracefs(void) if (!top_dir) return 0;
- tmp = tracefs_create_file("period_us", 0640, top_dir, + tmp = tracefs_create_file("period_us", TRACE_MODE_WRITE, top_dir, &osnoise_period, &trace_min_max_fops); if (!tmp) goto err;
- tmp = tracefs_create_file("runtime_us", 0644, top_dir, + tmp = tracefs_create_file("runtime_us", TRACE_MODE_WRITE, top_dir, &osnoise_runtime, &trace_min_max_fops); if (!tmp) goto err;
- tmp = tracefs_create_file("stop_tracing_us", 0640, top_dir, + tmp = tracefs_create_file("stop_tracing_us", TRACE_MODE_WRITE, top_dir, &osnoise_stop_tracing_in, &trace_min_max_fops); if (!tmp) goto err;
- tmp = tracefs_create_file("stop_tracing_total_us", 0640, top_dir, + tmp = tracefs_create_file("stop_tracing_total_us", TRACE_MODE_WRITE, top_dir, &osnoise_stop_tracing_total, &trace_min_max_fops); if (!tmp) goto err;
- tmp = trace_create_file("cpus", 0644, top_dir, NULL, &cpus_fops); + tmp = trace_create_file("cpus", TRACE_MODE_WRITE, top_dir, NULL, &cpus_fops); if (!tmp) goto err; #ifdef CONFIG_TIMERLAT_TRACER diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c index ff32476df072..697aeb3569fd 100644 --- a/kernel/trace/trace_printk.c +++ b/kernel/trace/trace_printk.c @@ -373,7 +373,7 @@ static __init int init_trace_printk_function_export(void) if (ret) return 0;
- trace_create_file("printk_formats", 0444, NULL, + trace_create_file("printk_formats", TRACE_MODE_READ, NULL, NULL, &ftrace_formats_fops);
return 0; diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index c408423e5d65..464fe0a7a032 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -560,14 +560,14 @@ static __init int stack_trace_init(void) if (ret) return 0;
- trace_create_file("stack_max_size", 0644, NULL, + trace_create_file("stack_max_size", TRACE_MODE_WRITE, NULL, &stack_trace_max_size, &stack_max_size_fops);
- trace_create_file("stack_trace", 0444, NULL, + trace_create_file("stack_trace", TRACE_MODE_READ, NULL, NULL, &stack_trace_fops);
#ifdef CONFIG_DYNAMIC_FTRACE - trace_create_file("stack_trace_filter", 0644, NULL, + trace_create_file("stack_trace_filter", TRACE_MODE_WRITE, NULL, &trace_ops, &stack_trace_filter_fops); #endif
diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c index 8d141c3825a9..bb247beec447 100644 --- a/kernel/trace/trace_stat.c +++ b/kernel/trace/trace_stat.c @@ -297,9 +297,9 @@ static int init_stat_file(struct stat_session *session) if (!stat_dir && (ret = tracing_stat_init())) return ret;
- session->file = tracefs_create_file(session->ts->name, 0644, - stat_dir, - session, &tracing_stat_fops); + session->file = tracefs_create_file(session->ts->name, TRACE_MODE_WRITE, + stat_dir, session, + &tracing_stat_fops); if (!session->file) return -ENOMEM; return 0; diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index a1bc68de1b29..a7e25b40ed34 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -1652,10 +1652,10 @@ static __init int init_uprobe_trace(void) if (ret) return 0;
- trace_create_file("uprobe_events", 0644, NULL, + trace_create_file("uprobe_events", TRACE_MODE_WRITE, NULL, NULL, &uprobe_events_ops); /* Profile interface */ - trace_create_file("uprobe_profile", 0444, NULL, + trace_create_file("uprobe_profile", TRACE_MODE_READ, NULL, NULL, &uprobe_profile_ops); return 0; }
From: Daniel Bristot de Oliveira bristot@kernel.org
mainline inclusion from mainline-v5.16-rc1 commit ccb6754495ef253af5e1253434f0d21b6225c4ad category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4G64B CVE: NA
-------------------------------------------------
Remove CONFIG_TIMERLAT_TRACER from inside functions, avoiding compilation problems in the future.
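The pattern used (visible in the diff below) is the common one of pairing each #ifdef'ed helper with a no-op static inline stub, so callers need no conditional compilation; a trimmed sketch:

	#ifdef CONFIG_TIMERLAT_TRACER
	static inline bool timerlat_enabled(void)
	{
		return osnoise_data.timerlat_tracer;
	}
	#else /* CONFIG_TIMERLAT_TRACER */
	static inline bool timerlat_enabled(void)
	{
		return false;	/* constant, so the dead branch folds away */
	}
	#endif

Call sites can then read 'if (timerlat_enabled())' unconditionally, and the compiler discards the disabled path.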
Link: https://lkml.kernel.org/r/8245abb5a112d249f5da6c1df499244ad9e647bc.163570289...
Cc: Ingo Molnar mingo@redhat.com Cc: Tom Zanussi zanussi@kernel.org Cc: Masami Hiramatsu mhiramat@kernel.org Cc: Juri Lelli juri.lelli@redhat.com Cc: Clark Williams williams@redhat.com Cc: John Kacur jkacur@redhat.com Cc: Peter Zijlstra peterz@infradead.org Cc: Thomas Gleixner tglx@linutronix.de Cc: Sebastian Andrzej Siewior bigeasy@linutronix.de Cc: Daniel Bristot de Oliveira bristot@kernel.org Cc: linux-rt-users@vger.kernel.org Cc: linux-trace-devel@vger.kernel.org Cc: linux-kernel@vger.kernel.org Suggested-by: Steven Rostedt rostedt@goodmis.org Signed-off-by: Daniel Bristot de Oliveira bristot@kernel.org Signed-off-by: Steven Rostedt (VMware) rostedt@goodmis.org Signed-off-by: Wang ShaoBo bobo.shaobowang@huawei.com Acked-by: Xie XiuQi xiexiuqi@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- kernel/trace/trace_osnoise.c | 150 +++++++++++++++++++++++++---------- 1 file changed, 106 insertions(+), 44 deletions(-)
diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c index 3cdbe9ae6fe7..4881861a66d7 100644 --- a/kernel/trace/trace_osnoise.c +++ b/kernel/trace/trace_osnoise.c @@ -321,6 +321,57 @@ static struct osnoise_data { #endif };
+#ifdef CONFIG_TIMERLAT_TRACER +static inline bool timerlat_enabled(void) +{ + return osnoise_data.timerlat_tracer; +} + +static inline int timerlat_softirq_exit(struct osnoise_variables *osn_var) +{ + struct timerlat_variables *tlat_var = this_cpu_tmr_var(); + /* + * If the timerlat is enabled, but the irq handler did + * not run yet enabling timerlat_tracer, do not trace. + */ + if (!tlat_var->tracing_thread) { + osn_var->softirq.arrival_time = 0; + osn_var->softirq.delta_start = 0; + return 0; + } + return 1; +} + +static inline int timerlat_thread_exit(struct osnoise_variables *osn_var) +{ + struct timerlat_variables *tlat_var = this_cpu_tmr_var(); + /* + * If the timerlat is enabled, but the irq handler did + * not run yet enabling timerlat_tracer, do not trace. + */ + if (!tlat_var->tracing_thread) { + osn_var->thread.delta_start = 0; + osn_var->thread.arrival_time = 0; + return 0; + } + return 1; +} +#else /* CONFIG_TIMERLAT_TRACER */ +static inline bool timerlat_enabled(void) +{ + return false; +} + +static inline int timerlat_softirq_exit(struct osnoise_variables *osn_var) +{ + return 1; +} +static inline int timerlat_thread_exit(struct osnoise_variables *osn_var) +{ + return 1; +} +#endif + #ifdef CONFIG_PREEMPT_RT /* * Print the osnoise header info. @@ -958,21 +1009,9 @@ static void trace_softirq_exit_callback(void *data, unsigned int vec_nr) if (!osn_var->sampling) return;
-#ifdef CONFIG_TIMERLAT_TRACER - /* - * If the timerlat is enabled, but the irq handler did - * not run yet enabling timerlat_tracer, do not trace. - */ - if (unlikely(osnoise_data.timerlat_tracer)) { - struct timerlat_variables *tlat_var; - tlat_var = this_cpu_tmr_var(); - if (!tlat_var->tracing_thread) { - osn_var->softirq.arrival_time = 0; - osn_var->softirq.delta_start = 0; + if (unlikely(timerlat_enabled())) + if (!timerlat_softirq_exit(osn_var)) return; - } - } -#endif
duration = get_int_safe_duration(osn_var, &osn_var->softirq.delta_start); trace_softirq_noise(vec_nr, osn_var->softirq.arrival_time, duration); @@ -1066,17 +1105,9 @@ thread_exit(struct osnoise_variables *osn_var, struct task_struct *t) if (!osn_var->sampling) return;
-#ifdef CONFIG_TIMERLAT_TRACER - if (osnoise_data.timerlat_tracer) { - struct timerlat_variables *tlat_var; - tlat_var = this_cpu_tmr_var(); - if (!tlat_var->tracing_thread) { - osn_var->thread.delta_start = 0; - osn_var->thread.arrival_time = 0; + if (unlikely(timerlat_enabled())) + if (!timerlat_thread_exit(osn_var)) return; - } - } -#endif
duration = get_int_safe_duration(osn_var, &osn_var->thread.delta_start);
@@ -1577,6 +1608,11 @@ static int timerlat_main(void *data) hrtimer_cancel(&tlat->timer); return 0; } +#else /* CONFIG_TIMERLAT_TRACER */ +static int timerlat_main(void *data) +{ + return 0; +} #endif /* CONFIG_TIMERLAT_TRACER */
/* @@ -1619,16 +1655,13 @@ static int start_kthread(unsigned int cpu) void *main = osnoise_main; char comm[24];
-#ifdef CONFIG_TIMERLAT_TRACER - if (osnoise_data.timerlat_tracer) { + if (timerlat_enabled()) { snprintf(comm, 24, "timerlat/%d", cpu); main = timerlat_main; } else { snprintf(comm, 24, "osnoise/%d", cpu); } -#else - snprintf(comm, 24, "osnoise/%d", cpu); -#endif + kthread = kthread_create_on_cpu(main, NULL, cpu, comm);
if (IS_ERR(kthread)) { @@ -1922,6 +1955,35 @@ static const struct file_operations cpus_fops = { .llseek = generic_file_llseek, };
+#ifdef CONFIG_TIMERLAT_TRACER +/* + * init_timerlat_tracefs - A function to initialize the timerlat interface files + */ +static int init_timerlat_tracefs(struct dentry *top_dir) +{ + struct dentry *tmp; + +#ifdef CONFIG_STACKTRACE + tmp = tracefs_create_file("print_stack", TRACE_MODE_WRITE, top_dir, + &osnoise_print_stack, &trace_min_max_fops); + if (!tmp) + return -ENOMEM; +#endif + + tmp = tracefs_create_file("timerlat_period_us", TRACE_MODE_WRITE, top_dir, + &timerlat_period, &trace_min_max_fops); + if (!tmp) + return -ENOMEM; + + return 0; +} +#else /* CONFIG_TIMERLAT_TRACER */ +static int init_timerlat_tracefs(struct dentry *top_dir) +{ + return 0; +} +#endif /* CONFIG_TIMERLAT_TRACER */ + /* * init_tracefs - A function to initialize the tracefs interface files * @@ -1966,19 +2028,10 @@ static int init_tracefs(void) tmp = trace_create_file("cpus", TRACE_MODE_WRITE, top_dir, NULL, &cpus_fops); if (!tmp) goto err; -#ifdef CONFIG_TIMERLAT_TRACER -#ifdef CONFIG_STACKTRACE - tmp = tracefs_create_file("print_stack", 0640, top_dir, - &osnoise_print_stack, &trace_min_max_fops); - if (!tmp) - goto err; -#endif
- tmp = tracefs_create_file("timerlat_period_us", 0640, top_dir, - &timerlat_period, &trace_min_max_fops); - if (!tmp) + ret = init_timerlat_tracefs(top_dir); + if (ret) goto err; -#endif
return 0;
@@ -2184,6 +2237,16 @@ static struct tracer timerlat_tracer __read_mostly = { .print_header = print_timerlat_headers, .allow_instances = true, }; + +__init static int init_timerlat_tracer(void) +{ + return register_tracer(&timerlat_tracer); +} +#else /* CONFIG_TIMERLAT_TRACER */ +__init static int init_timerlat_tracer(void) +{ + return 0; +} #endif /* CONFIG_TIMERLAT_TRACER */
__init static int init_osnoise_tracer(void) @@ -2200,13 +2263,12 @@ __init static int init_osnoise_tracer(void) return ret; }
-#ifdef CONFIG_TIMERLAT_TRACER - ret = register_tracer(&timerlat_tracer); + ret = init_timerlat_tracer(); if (ret) { - pr_err(BANNER "Error registering timerlat\n"); + pr_err(BANNER "Error registering timerlat!\n"); return ret; } -#endif + osnoise_init_hotplug_support();
INIT_LIST_HEAD_RCU(&osnoise_instances);
From: Daniel Bristot de Oliveira bristot@kernel.org
mainline inclusion from mainline-v5.16-rc1 commit 2fac8d6486d5c34e2ec7028580142b8209da3f92 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4G64B CVE: NA
-------------------------------------------------
Currently, the user can start only one instance of timerlat/osnoise tracers and the tracers cannot run in parallel.
As a starting point to add more flexibility, let's allow the same tracer to run on different trace instances. The workload will start when the first trace_array (instance) is registered and stop when the last instance is unregistered.
So, while this patch allows the same tracer to run in multiple instances (e.g., two instances running osnoise), it still does not allow timerlat and osnoise instances in parallel (e.g., one timerlat and one osnoise). That is because the osnoise: events behave differently depending on which tracer is enabled (osnoise or timerlat). Enabling the parallel usage of these two tracers is on my TODO list.
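The workload lifetime thus acts like a reference count expressed through list membership; the two guards added by this patch reduce to the following (commented sketch, see the diff below):

	/* start: any registered instance means the workload already runs */
	if (osnoise_has_registered_instances())
		return 0;

	/* stop: instances are unregistered before calling stop, so a
	 * remaining instance means another user still needs the workload */
	if (osnoise_has_registered_instances())
		return;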
Link: https://lkml.kernel.org/r/38c8f14b613492a4f3f938d9d3bf0b063b72f0f0.163570289...
Cc: Ingo Molnar mingo@redhat.com Cc: Tom Zanussi zanussi@kernel.org Cc: Masami Hiramatsu mhiramat@kernel.org Cc: Juri Lelli juri.lelli@redhat.com Cc: Clark Williams williams@redhat.com Cc: John Kacur jkacur@redhat.com Cc: Peter Zijlstra peterz@infradead.org Cc: Thomas Gleixner tglx@linutronix.de Cc: Sebastian Andrzej Siewior bigeasy@linutronix.de Cc: Daniel Bristot de Oliveira bristot@kernel.org Cc: linux-rt-users@vger.kernel.org Cc: linux-trace-devel@vger.kernel.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Daniel Bristot de Oliveira bristot@kernel.org Signed-off-by: Steven Rostedt (VMware) rostedt@goodmis.org Signed-off-by: Wang ShaoBo bobo.shaobowang@huawei.com Acked-by: Xie XiuQi xiexiuqi@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- kernel/trace/trace_osnoise.c | 101 +++++++++++++++++++++++++++-------- 1 file changed, 78 insertions(+), 23 deletions(-)
diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c index 4881861a66d7..b949631bc843 100644 --- a/kernel/trace/trace_osnoise.c +++ b/kernel/trace/trace_osnoise.c @@ -64,6 +64,24 @@ static bool osnoise_has_registered_instances(void) list); }
+/* + * osnoise_instance_registered - check if a tr is already registered + */ +static int osnoise_instance_registered(struct trace_array *tr) +{ + struct osnoise_instance *inst; + int found = 0; + + rcu_read_lock(); + list_for_each_entry_rcu(inst, &osnoise_instances, list) { + if (inst->tr == tr) + found = 1; + } + rcu_read_unlock(); + + return found; +} + /* * osnoise_register_instance - register a new trace instance * @@ -2079,6 +2097,16 @@ static int osnoise_workload_start(void) { int retval;
+ /* + * Instances need to be registered after calling workload + * start. Hence, if there is already an instance, the + * workload was already registered. Otherwise, this + * code is on the way to register the first instance, + * and the workload will start. + */ + if (osnoise_has_registered_instances()) + return 0; + osn_var_reset_all();
retval = osnoise_hook_events(); @@ -2106,6 +2134,13 @@ static int osnoise_workload_start(void) */ static void osnoise_workload_stop(void) { + /* + * Instances need to be unregistered before calling + * stop. Hence, if there is a registered instance, more + * than one instance is running, and the workload will not + * yet stop. Otherwise, this code is on the way to disable + * the last instance, and the workload can stop. + */ if (osnoise_has_registered_instances()) return;
@@ -2127,7 +2162,11 @@ static void osnoise_tracer_start(struct trace_array *tr) { int retval;
- if (osnoise_has_registered_instances()) + /* + * If the instance is already registered, there is no need to + * register it again. + */ + if (osnoise_instance_registered(tr)) return;
retval = osnoise_workload_start(); @@ -2139,18 +2178,17 @@ static void osnoise_tracer_start(struct trace_array *tr)
static void osnoise_tracer_stop(struct trace_array *tr) { - if (!osnoise_has_registered_instances()) - return; - osnoise_unregister_instance(tr); osnoise_workload_stop(); }
static int osnoise_tracer_init(struct trace_array *tr) { - - /* Only allow one instance to enable this */ - if (osnoise_has_registered_instances()) + /* + * Only allow osnoise tracer if timerlat tracer is not running + * already. + */ + if (timerlat_enabled()) return -EBUSY;
tr->max_latency = 0; @@ -2179,45 +2217,55 @@ static void timerlat_tracer_start(struct trace_array *tr) { int retval;
- if (osnoise_has_registered_instances()) + /* + * If the instance is already registered, there is no need to + * register it again. + */ + if (osnoise_instance_registered(tr)) return;
- osnoise_data.timerlat_tracer = 1; - retval = osnoise_workload_start(); if (retval) - goto out_err; + pr_err(BANNER "Error starting timerlat tracer\n");
osnoise_register_instance(tr);
return; -out_err: - pr_err(BANNER "Error starting timerlat tracer\n"); }
static void timerlat_tracer_stop(struct trace_array *tr) { int cpu;
- if (!osnoise_has_registered_instances()) - return; - - for_each_online_cpu(cpu) - per_cpu(per_cpu_osnoise_var, cpu).sampling = 0; + osnoise_unregister_instance(tr);
- osnoise_tracer_stop(tr); + /* + * Instruct the threads to stop only if this is the last instance. + */ + if (!osnoise_has_registered_instances()) { + for_each_online_cpu(cpu) + per_cpu(per_cpu_osnoise_var, cpu).sampling = 0; + }
- osnoise_data.timerlat_tracer = 0; + osnoise_workload_stop(); }
static int timerlat_tracer_init(struct trace_array *tr) { - /* Only allow one instance to enable this */ - if (osnoise_has_registered_instances()) + /* + * Only allow timerlat tracer if osnoise tracer is not running already. + */ + if (osnoise_has_registered_instances() && !osnoise_data.timerlat_tracer) return -EBUSY;
- tr->max_latency = 0; + /* + * If this is the first instance, set timerlat_tracer to block + * osnoise tracer start. + */ + if (!osnoise_has_registered_instances()) + osnoise_data.timerlat_tracer = 1;
+ tr->max_latency = 0; timerlat_tracer_start(tr);
return 0; @@ -2226,6 +2274,13 @@ static int timerlat_tracer_init(struct trace_array *tr) static void timerlat_tracer_reset(struct trace_array *tr) { timerlat_tracer_stop(tr); + + /* + * If this is the last instance, reset timerlat_tracer allowing + * osnoise to be started. + */ + if (!osnoise_has_registered_instances()) + osnoise_data.timerlat_tracer = 0; }
static struct tracer timerlat_tracer __read_mostly = {
From: Daniel Bristot de Oliveira bristot@kernel.org
mainline inclusion from mainline-v5.16-rc1 commit b14f4568d391c3b9bda9c078a32977e3f939f020 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4G64B CVE: NA
-------------------------------------------------
Remove CONFIG_STACKTRACE from inside functions, avoiding compilation problems in the future.
Link: https://lkml.kernel.org/r/3465cca2f28e1ba602a1fc8bdb28d12950b5226e.163570289...
Cc: Ingo Molnar mingo@redhat.com Cc: Tom Zanussi zanussi@kernel.org Cc: Masami Hiramatsu mhiramat@kernel.org Cc: Juri Lelli juri.lelli@redhat.com Cc: Clark Williams williams@redhat.com Cc: John Kacur jkacur@redhat.com Cc: Peter Zijlstra peterz@infradead.org Cc: Thomas Gleixner tglx@linutronix.de Cc: Sebastian Andrzej Siewior bigeasy@linutronix.de Cc: Daniel Bristot de Oliveira bristot@kernel.org Cc: linux-rt-users@vger.kernel.org Cc: linux-trace-devel@vger.kernel.org Cc: linux-kernel@vger.kernel.org Suggested-by: Steven Rostedt rostedt@goodmis.org Signed-off-by: Daniel Bristot de Oliveira bristot@kernel.org Signed-off-by: Steven Rostedt (VMware) rostedt@goodmis.org Signed-off-by: Wang ShaoBo bobo.shaobowang@huawei.com Acked-by: Xie XiuQi xiexiuqi@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- kernel/trace/trace_osnoise.c | 44 ++++++++++++++++++++++++------------ 1 file changed, 29 insertions(+), 15 deletions(-)
diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c index b949631bc843..e77128f86e48 100644 --- a/kernel/trace/trace_osnoise.c +++ b/kernel/trace/trace_osnoise.c @@ -626,13 +626,19 @@ __timerlat_dump_stack(struct trace_buffer *buffer, struct trace_stack *fstack, u /* * timerlat_dump_stack - dump a stack trace previously saved */ -static void timerlat_dump_stack(void) +static void timerlat_dump_stack(u64 latency) { struct osnoise_instance *inst; struct trace_buffer *buffer; struct trace_stack *fstack; unsigned int size;
+ /* + * trace only if latency > print_stack config, if enabled. + */ + if (!osnoise_data.print_stack || osnoise_data.print_stack > latency) + return; + preempt_disable_notrace(); fstack = this_cpu_ptr(&trace_stack); size = fstack->stack_size; @@ -646,8 +652,8 @@ static void timerlat_dump_stack(void) rcu_read_unlock(); preempt_enable_notrace(); } -#else -#define timerlat_dump_stack() do {} while (0) +#else /* CONFIG_STACKTRACE */ +#define timerlat_dump_stack(u64 latency) do {} while (0) #define timerlat_save_stack(a) do {} while (0) #endif /* CONFIG_STACKTRACE */ #endif /* CONFIG_TIMERLAT_TRACER */ @@ -1609,11 +1615,7 @@ static int timerlat_main(void *data)
trace_timerlat_sample(&s);
-#ifdef CONFIG_STACKTRACE - if (osnoise_data.print_stack) - if (osnoise_data.print_stack <= time_to_us(diff)) - timerlat_dump_stack(); -#endif /* CONFIG_STACKTRACE */ + timerlat_dump_stack(time_to_us(diff));
tlat->tracing_thread = false; if (osnoise_data.stop_tracing_total) @@ -1974,26 +1976,38 @@ static const struct file_operations cpus_fops = { };
#ifdef CONFIG_TIMERLAT_TRACER -/* - * init_timerlat_tracefs - A function to initialize the timerlat interface files - */ -static int init_timerlat_tracefs(struct dentry *top_dir) +#ifdef CONFIG_STACKTRACE +static int init_timerlat_stack_tracefs(struct dentry *top_dir) { struct dentry *tmp;
-#ifdef CONFIG_STACKTRACE tmp = tracefs_create_file("print_stack", TRACE_MODE_WRITE, top_dir, &osnoise_print_stack, &trace_min_max_fops); if (!tmp) return -ENOMEM; -#endif + + return 0; +} +#else /* CONFIG_STACKTRACE */ +static int init_timerlat_stack_tracefs(struct dentry *top_dir) +{ + return 0; +} +#endif /* CONFIG_STACKTRACE */ + +/* + * init_timerlat_tracefs - A function to initialize the timerlat interface files + */ +static int init_timerlat_tracefs(struct dentry *top_dir) +{ + struct dentry *tmp;
tmp = tracefs_create_file("timerlat_period_us", TRACE_MODE_WRITE, top_dir, &timerlat_period, &trace_min_max_fops); if (!tmp) return -ENOMEM;
- return 0; + return init_timerlat_stack_tracefs(top_dir); } #else /* CONFIG_TIMERLAT_TRACER */ static int init_timerlat_tracefs(struct dentry *top_dir)
From: Daniel Bristot de Oliveira bristot@kernel.org
mainline inclusion from mainline-v5.16-rc1 commit 01e181c776fddf3a9e7a2ef229cc6e7ddf126fe7 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4G64B CVE: NA
-------------------------------------------------
Remove CONFIG_PREEMPT_RT from inside functions, avoiding compilation problems in the future.
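The technique in a minimal before/after fragment, abbreviated from the hunk below: IS_ENABLED() yields a compile-time constant, so both branches stay visible to the compiler (and are type-checked) while the dead one is optimized away.

	/* before: preprocessor conditional inside the function body */
#ifndef CONFIG_PREEMPT_RT
	if (osn_var->softirq.delta_start)
		copy_int_safe_time(osn_var, &osn_var->thread.delta_start,
				   &osn_var->softirq.delta_start);
#endif

	/* after: plain C that folds to a constant condition */
	if (!IS_ENABLED(CONFIG_PREEMPT_RT) && osn_var->softirq.delta_start)
		copy_int_safe_time(osn_var, &osn_var->thread.delta_start,
				   &osn_var->softirq.delta_start);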
Link: https://lkml.kernel.org/r/37ee0881b033cdc513efc84ebea26cf77880c8c2.163570289...
Cc: Ingo Molnar mingo@redhat.com Cc: Tom Zanussi zanussi@kernel.org Cc: Masami Hiramatsu mhiramat@kernel.org Cc: Juri Lelli juri.lelli@redhat.com Cc: Clark Williams williams@redhat.com Cc: John Kacur jkacur@redhat.com Cc: Peter Zijlstra peterz@infradead.org Cc: Thomas Gleixner tglx@linutronix.de Cc: Sebastian Andrzej Siewior bigeasy@linutronix.de Cc: Daniel Bristot de Oliveira bristot@kernel.org Cc: linux-rt-users@vger.kernel.org Cc: linux-trace-devel@vger.kernel.org Cc: linux-kernel@vger.kernel.org Suggested-by: Steven Rostedt rostedt@goodmis.org Signed-off-by: Daniel Bristot de Oliveira bristot@kernel.org Signed-off-by: Steven Rostedt (VMware) rostedt@goodmis.org Signed-off-by: Wang ShaoBo bobo.shaobowang@huawei.com Acked-by: Xie XiuQi xiexiuqi@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- kernel/trace/trace_osnoise.c | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-)
diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c index e77128f86e48..e9074dbcf5fb 100644 --- a/kernel/trace/trace_osnoise.c +++ b/kernel/trace/trace_osnoise.c @@ -1495,9 +1495,11 @@ static enum hrtimer_restart timerlat_irq(struct hrtimer *timer) * running, the thread needs to receive the softirq delta_start. The * reason being is that the softirq will be the last to be unfolded, * resseting the thread delay to zero. + * + * The PREEMPT_RT is a special case, though. As softirqs run as threads + * on RT, moving the thread is enough. */ -#ifndef CONFIG_PREEMPT_RT - if (osn_var->softirq.delta_start) { + if (!IS_ENABLED(CONFIG_PREEMPT_RT) && osn_var->softirq.delta_start) { copy_int_safe_time(osn_var, &osn_var->thread.delta_start, &osn_var->softirq.delta_start);
@@ -1507,13 +1509,6 @@ static enum hrtimer_restart timerlat_irq(struct hrtimer *timer) copy_int_safe_time(osn_var, &osn_var->thread.delta_start, &osn_var->irq.delta_start); } -#else /* CONFIG_PREEMPT_RT */ - /* - * The sofirqs run as threads on RT, so there is not need - * to keep track of it. - */ - copy_int_safe_time(osn_var, &osn_var->thread.delta_start, &osn_var->irq.delta_start); -#endif /* CONFIG_PREEMPT_RT */
/* * Compute the current time with the expected time.
From: Daniel Bristot de Oliveira bristot@kernel.org
mainline inclusion from mainline-v5.16-rc1 commit d7458bc0d8b409460713228d2ed279addb38947a category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4G64B CVE: NA
-------------------------------------------------
Make the struct list_head osnoise_instances definition static.
Link: https://lore.kernel.org/all/202111120052.ZuikQSJi-lkp@intel.com/ Link: https://lkml.kernel.org/r/d001f0eeac66e2b2eeec7d2a15e9e7abede0453a.163666797...
Cc: Ingo Molnar mingo@redhat.com Fixes: dae181349f1e ("tracing/osnoise: Support a list of trace_array *tr") Reported-by: kernel test robot lkp@intel.com Signed-off-by: Daniel Bristot de Oliveira bristot@kernel.org Signed-off-by: Steven Rostedt (VMware) rostedt@goodmis.org Signed-off-by: Wang ShaoBo bobo.shaobowang@huawei.com Acked-by: Xie XiuQi xiexiuqi@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- kernel/trace/trace_osnoise.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c index e9074dbcf5fb..a32951f892cc 100644 --- a/kernel/trace/trace_osnoise.c +++ b/kernel/trace/trace_osnoise.c @@ -55,7 +55,8 @@ struct osnoise_instance { struct list_head list; struct trace_array *tr; }; -struct list_head osnoise_instances; + +static struct list_head osnoise_instances;
static bool osnoise_has_registered_instances(void) {
From: "Steven Rostedt (VMware)" rostedt@goodmis.org
mainline inclusion from mainline-v5.14-rc1 commit b96285e10aad234acfa0628f7e8336990f778c03 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4G64B CVE: NA
-------------------------------------------------
ftracetest triggered:
INFO: rcu_tasks detected stalls on tasks: 00000000b92b832d: .. nvcsw: 1/1 holdout: 1 idle_cpu: -1/7 task:osnoise/7 state:R running task stack: 0 pid: 2133 ppid: 2 flags:0x00004000 Call Trace: ? asm_sysvec_apic_timer_interrupt+0x12/0x20 ? asm_sysvec_apic_timer_interrupt+0x12/0x20 ? trace_hardirqs_on+0x2b/0xe0 ? asm_sysvec_apic_timer_interrupt+0x12/0x20 ? trace_clock_local+0xc/0x20 ? osnoise_main+0x10e/0x450 ? trace_softirq_entry_callback+0x50/0x50 ? kthread+0x153/0x170 ? __kthread_bind_mask+0x60/0x60 ? ret_from_fork+0x22/0x30
This happened while running the osnoise tracer together with other tracers that rely on synchronize_rcu_tasks(), which simply hung.
The reason is that osnoise_main() never schedules out if the interval is less than 1, and this will cause synchronize_rcu_tasks() to never return.
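The loop shape that causes the stall, and the fix, in a minimal sketch; run_osnoise_sampling() is a hypothetical placeholder for the sampling work, not a function in the tracer:

	while (!kthread_should_stop()) {
		run_osnoise_sampling();

		if (interval < 1) {
			/* never sleeps: report a quiescent state by hand */
			cond_resched_tasks_rcu_qs();
			continue;
		}

		/* sleeping is itself a quiescent state for RCU-tasks */
		if (msleep_interruptible(interval))
			break;
	}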
Link: https://lkml.kernel.org/r/20210628114953.6dc06a91@oasis.local.home
Fixes: bce29ac9ce0bb ("trace: Add osnoise tracer") Acked-by: Paul E. McKenney paulmck@kernel.org Reviewed-by: Daniel Bristot de Oliveira bristot@redhat.com Signed-off-by: Steven Rostedt (VMware) rostedt@goodmis.org Signed-off-by: Wang ShaoBo bobo.shaobowang@huawei.com Acked-by: Xie XiuQi xiexiuqi@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- kernel/trace/trace_osnoise.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-)
diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c index a32951f892cc..545c3d6489bd 100644 --- a/kernel/trace/trace_osnoise.c +++ b/kernel/trace/trace_osnoise.c @@ -1441,8 +1441,11 @@ static int osnoise_main(void *data) * differently from hwlat_detector, the osnoise tracer can run * without a pause because preemption is on. */ - if (interval < 1) + if (interval < 1) { + /* Let synchronize_rcu_tasks() make progress */ + cond_resched_tasks_rcu_qs(); continue; + }
if (msleep_interruptible(interval)) break;
From: Wang ShaoBo bobo.shaobowang@huawei.com
hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4G64B CVE: NA
-------------------------------------------------
Set CONFIG_OSNOISE_TRACER and CONFIG_TIMERLAT_TRACER to y by default.
Signed-off-by: Wang ShaoBo bobo.shaobowang@huawei.com Acked-by: Xie XiuQi xiexiuqi@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- arch/arm64/configs/openeuler_defconfig | 2 ++ 1 file changed, 2 insertions(+)
diff --git a/arch/arm64/configs/openeuler_defconfig b/arch/arm64/configs/openeuler_defconfig index 83691c2ad677..63436eaa1ada 100644 --- a/arch/arm64/configs/openeuler_defconfig +++ b/arch/arm64/configs/openeuler_defconfig @@ -7149,3 +7149,5 @@ CONFIG_NET_VENDOR_RAMAXEL=y CONFIG_SPNIC=m CONFIG_SPFC=m CONFIG_GPIO_HISI=y +CONFIG_OSNOISE_TRACER=y +CONFIG_TIMERLAT_TRACER=y
From: Wang ShaoBo bobo.shaobowang@huawei.com
hulk inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4G64B CVE: NA
-------------------------------------------------
This patch fixes the following compile warning:
---------------------- [WARNING] checkbuild_arm64 ----------------------
build warning: arm64, allmodconfig
kernel/trace/trace_osnoise.c: In function ‘start_kthread’: kernel/trace/trace_osnoise.c:1674:8: warning: ‘main’ is usually a function [-Wmain] void *main = osnoise_main; ^~~~ ---------------------- [WARNING] checkbuild_x86_64 ----------------------
build warning: x86_64, allmodconfig
kernel/trace/trace_osnoise.c: In function ‘start_kthread’: kernel/trace/trace_osnoise.c:1674:8: warning: ‘main’ is usually a function [-Wmain] void *main = osnoise_main; ^~~~
Signed-off-by: Wang ShaoBo bobo.shaobowang@huawei.com Acked-by: Xie XiuQi xiexiuqi@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- kernel/trace/trace_osnoise.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c index 545c3d6489bd..cfb80feb291e 100644 --- a/kernel/trace/trace_osnoise.c +++ b/kernel/trace/trace_osnoise.c @@ -1671,17 +1671,17 @@ static void stop_per_cpu_kthreads(void) static int start_kthread(unsigned int cpu) { struct task_struct *kthread; - void *main = osnoise_main; + void *func_main = osnoise_main; char comm[24];
if (timerlat_enabled()) { snprintf(comm, 24, "timerlat/%d", cpu); - main = timerlat_main; + func_main = timerlat_main; } else { snprintf(comm, 24, "osnoise/%d", cpu); }
- kthread = kthread_create_on_cpu(main, NULL, cpu, comm); + kthread = kthread_create_on_cpu(func_main, NULL, cpu, comm);
if (IS_ERR(kthread)) { pr_err(BANNER "could not start sampling thread\n");
From: Yu Liao liaoyu15@huawei.com
mainline inclusion from mainline-v5.16-rc6 commit 4e8c11b6b3f0b6a283e898344f154641eda94266 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4OJMW CVE: NA
-------------------------------------------------
Even after commit e1d7ba873555 ("time: Always make sure wall_to_monotonic isn't positive") it is still possible to make wall_to_monotonic positive by running the following code:
int main(void) { struct timespec time;
clock_gettime(CLOCK_MONOTONIC, &time); time.tv_nsec = 0; clock_settime(CLOCK_REALTIME, &time); return 0; }
The reason is that the second parameter of timespec64_compare(), ts_delta, may be unnormalized because the delta is calculated with an open coded subtraction, which causes the comparison of tv_sec to yield the wrong result:
wall_to_monotonic = { .tv_sec = -10, .tv_nsec = 900000000 } ts_delta = { .tv_sec = -9, .tv_nsec = -900000000 }
That makes timespec64_compare() claim that wall_to_monotonic < ts_delta, but actually the result should be wall_to_monotonic > ts_delta.
After normalization, the result of timespec64_compare() is correct because the tv_sec comparison is no longer misleading:
wall_to_monotonic = { .tv_sec = -10, .tv_nsec = 900000000 } ts_delta = { .tv_sec = -10, .tv_nsec = 100000000 }
Use timespec64_sub() to ensure that ts_delta is normalized, which fixes the issue.
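The hazard can be reproduced in plain userspace C. ts_normalize() below is a hypothetical stand-in for the carry that the kernel's timespec64_sub()/set_normalized_timespec64() performs; the values match the example above:

#include <stdio.h>

#define NSEC_PER_SEC 1000000000L

struct ts { long long tv_sec; long tv_nsec; };

/* carry tv_nsec into tv_sec so that 0 <= tv_nsec < NSEC_PER_SEC */
static struct ts ts_normalize(struct ts t)
{
	while (t.tv_nsec < 0) { t.tv_nsec += NSEC_PER_SEC; t.tv_sec--; }
	while (t.tv_nsec >= NSEC_PER_SEC) { t.tv_nsec -= NSEC_PER_SEC; t.tv_sec++; }
	return t;
}

/* same logic as timespec64_compare(): tv_sec first, then tv_nsec */
static int ts_compare(const struct ts *a, const struct ts *b)
{
	if (a->tv_sec != b->tv_sec)
		return a->tv_sec < b->tv_sec ? -1 : 1;
	return (a->tv_nsec < b->tv_nsec) ? -1 : (a->tv_nsec > b->tv_nsec);
}

int main(void)
{
	struct ts w2m  = { -10, 900000000 };	/* always normalized */
	struct ts raw  = {  -9, -900000000 };	/* open-coded delta */
	struct ts norm = ts_normalize(raw);	/* { -10, 100000000 } */

	printf("unnormalized: %d\n", ts_compare(&w2m, &raw));	/* -1, wrong */
	printf("normalized:   %d\n", ts_compare(&w2m, &norm));	/*  1, right */
	return 0;
}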
Fixes: e1d7ba873555 ("time: Always make sure wall_to_monotonic isn't positive") Signed-off-by: Yu Liao liaoyu15@huawei.com Signed-off-by: Thomas Gleixner tglx@linutronix.de Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20211213135727.1656662-1-liaoyu15@huawei.com Reviewed-by: Xiongfeng Wang wangxiongfeng2@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- kernel/time/timekeeping.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-)
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 6858a31364b6..cc4dc2857a87 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -1310,8 +1310,7 @@ int do_settimeofday64(const struct timespec64 *ts) timekeeping_forward_now(tk);
xt = tk_xtime(tk); - ts_delta.tv_sec = ts->tv_sec - xt.tv_sec; - ts_delta.tv_nsec = ts->tv_nsec - xt.tv_nsec; + ts_delta = timespec64_sub(*ts, xt);
if (timespec64_compare(&tk->wall_to_monotonic, &ts_delta) > 0) { ret = -EINVAL;
From: Wang Wensheng wangwensheng4@huawei.com
ascend inclusion category: Feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4NDAW CVE: NA
-------------------
Introduce the function hugetlb_alloc_hugepage() that allocates hugepages from the static pool first. When the static hugepages are used up, it attempts to get hugepages from the buddy system. Two additional modes are supported: static hugepages only and buddy hugepages only.
When a driver gets huge pages via alloc_huge_page_node(), it attempts to allocate migrate hugepages once the reserved hugepages are used up. We expect the migrate hugepages obtained this way to be charged to memcg so that memory usage can be limited, so add support for charging migrate hugepages and enable it by default.
Add hugetlb_insert_hugepage_pte[_by_pa]() to insert hugepages into a page table. The _by_pa version behaves like remap_pfn_range(): it makes the PTE special, so it can be used for reserved physical memory.
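A hedged usage sketch of the new interface for a hypothetical driver; mm, va and the protection value are assumptions, and error handling is abbreviated:

	struct page *hpage;
	int ret;

	/*
	 * HUGETLB_ALLOC_NORMAL takes only a reserved (static) hugepage;
	 * HUGETLB_ALLOC_BUDDY takes only a buddy one; flag 0 tries the
	 * static pool first and then falls back to migrate hugepages.
	 */
	hpage = hugetlb_alloc_hugepage(0 /* nid */, HUGETLB_ALLOC_NORMAL);
	if (!hpage)
		return -ENOMEM;

	/* va must be PMD-aligned and covered by an existing VMA of mm */
	ret = hugetlb_insert_hugepage_pte(mm, va, PAGE_SHARED, hpage);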
Signed-off-by: Wang Wensheng wangwensheng4@huawei.com Signed-off-by: Zhou Guanghui zhouguanghui1@huawei.com Signed-off-by: Zhang Jian zhangjian210@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Reviewed-by: Weilong Chen chenweilong@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- arch/arm64/Kconfig | 12 ++++ include/linux/hugetlb.h | 62 ++++++++++++++++++ mm/hugetlb.c | 139 +++++++++++++++++++++++++++++++++++++++- 3 files changed, 212 insertions(+), 1 deletion(-)
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 3afabc81551c..85610dd3ac0c 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -2022,6 +2022,18 @@ config ASCEND_OOM 0: disable oom killer 1: enable oom killer (default,compatible with mainline)
+config ASCEND_CHARGE_MIGRATE_HUGEPAGES + bool "Enable support for migrate hugepages" + depends on HUGETLBFS + default y + help + When reserved hugepages are used up, we attempt to apply for migrate + hugepages. We expect that the migrated hugepages that are applied for + can be charged in memcg to limit the memory usage. + + This option enables the feature to charge migrate hugepages to memory + cgroup. + endif
endmenu diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index fd9635a6a92f..397e6bfa8268 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -607,6 +607,45 @@ struct page *alloc_huge_page_vma(struct hstate *h, struct vm_area_struct *vma, int huge_add_to_page_cache(struct page *page, struct address_space *mapping, pgoff_t idx);
+#ifdef CONFIG_ASCEND_FEATURES +#define HUGETLB_ALLOC_NONE 0x00 +#define HUGETLB_ALLOC_NORMAL 0x01 /* normal hugepage */ +#define HUGETLB_ALLOC_BUDDY 0x02 /* buddy hugepage */ +#define HUGETLB_ALLOC_MASK (HUGETLB_ALLOC_NONE | \ + HUGETLB_ALLOC_NORMAL | \ + HUGETLB_ALLOC_BUDDY) + +const struct hstate *hugetlb_get_hstate(void); +struct page *hugetlb_alloc_hugepage(int nid, int flag); +int hugetlb_insert_hugepage_pte(struct mm_struct *mm, unsigned long addr, + pgprot_t prot, struct page *hpage); +int hugetlb_insert_hugepage_pte_by_pa(struct mm_struct *mm, + unsigned long vir_addr, + pgprot_t prot, unsigned long phy_addr); +#else +static inline const struct hstate *hugetlb_get_hstate(void) +{ + return NULL; +} + +static inline struct page *hugetlb_alloc_hugepage(int nid, int flag) +{ + return NULL; +} + +static inline int hugetlb_insert_hugepage_pte(struct mm_struct *mm, + unsigned long addr, pgprot_t prot, struct page *hpage) +{ + return -EPERM; +} +static inline int hugetlb_insert_hugepage_pte_by_pa(struct mm_struct *mm, + unsigned long vir_addr, + pgprot_t prot, unsigned long phy_addr) +{ + return -EPERM; +} +#endif + /* arch callback */ int __init __alloc_bootmem_huge_page(struct hstate *h); int __init alloc_bootmem_huge_page(struct hstate *h); @@ -1028,6 +1067,29 @@ static inline void set_huge_swap_pte_at(struct mm_struct *mm, unsigned long addr pte_t *ptep, pte_t pte, unsigned long sz) { } + +static inline const struct hstate *hugetlb_get_hstate(void) +{ + return NULL; +} + +static inline struct page *hugetlb_alloc_hugepage(int nid, int flag) +{ + return NULL; +} + +static inline int hugetlb_insert_hugepage_pte(struct mm_struct *mm, + unsigned long addr, pgprot_t prot, struct page *hpage) +{ + return -EPERM; +} + +static inline int hugetlb_insert_hugepage_pte_by_pa(struct mm_struct *mm, + unsigned long vir_addr, + pgprot_t prot, unsigned long phy_addr) +{ + return -EPERM; +} #endif /* CONFIG_HUGETLB_PAGE */
#ifdef CONFIG_HUGETLB_PAGE_FREE_VMEMMAP diff --git a/mm/hugetlb.c b/mm/hugetlb.c index d0672e482879..eaddb18d58e1 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -67,7 +67,6 @@ static struct hstate * __initdata parsed_hstate; static unsigned long __initdata default_hstate_max_huge_pages; static bool __initdata parsed_valid_hugepagesz = true; static bool __initdata parsed_default_hugepagesz; - /* * Protects updates to hugepage_freelists, hugepage_activelist, nr_huge_pages, * free_huge_pages, and surplus_huge_pages. @@ -5907,3 +5906,141 @@ void __init hugetlb_cma_check(void) }
#endif /* CONFIG_CMA */ + +#ifdef CONFIG_ASCEND_FEATURES +static int enable_charge_mighp __read_mostly; + +const struct hstate *hugetlb_get_hstate(void) +{ + return &default_hstate; +} +EXPORT_SYMBOL_GPL(hugetlb_get_hstate); + +static struct page *hugetlb_alloc_hugepage_normal(struct hstate *h, + gfp_t gfp_mask, int nid) +{ + struct page *page = NULL; + + spin_lock(&hugetlb_lock); + if (h->free_huge_pages - h->resv_huge_pages > 0) + page = dequeue_huge_page_nodemask(h, gfp_mask, nid, NULL, NULL); + spin_unlock(&hugetlb_lock); + + return page; +} + +/* + * Allocate hugepage without reserve + */ +struct page *hugetlb_alloc_hugepage(int nid, int flag) +{ + struct hstate *h = &default_hstate; + gfp_t gfp_mask = htlb_alloc_mask(h); + struct page *page = NULL; + + if (nid == NUMA_NO_NODE) + nid = numa_mem_id(); + + if (nid < 0 || nid >= MAX_NUMNODES) + return NULL; + + if (flag & ~HUGETLB_ALLOC_MASK) + return NULL; + + gfp_mask |= __GFP_THISNODE; + if (enable_charge_mighp) + gfp_mask |= __GFP_ACCOUNT; + + if (flag & HUGETLB_ALLOC_NORMAL) + page = hugetlb_alloc_hugepage_normal(h, gfp_mask, nid); + else if (flag & HUGETLB_ALLOC_BUDDY) + page = alloc_migrate_huge_page(h, gfp_mask, nid, NULL); + else + page = alloc_huge_page_nodemask(h, nid, NULL, gfp_mask); + + return page; +} +EXPORT_SYMBOL_GPL(hugetlb_alloc_hugepage); + +static int __hugetlb_insert_hugepage(struct mm_struct *mm, unsigned long addr, + pgprot_t prot, unsigned long pfn, bool special) +{ + int ret = 0; + pte_t *ptep, entry; + struct hstate *h; + struct vm_area_struct *vma; + struct address_space *mapping; + spinlock_t *ptl; + + h = size_to_hstate(PMD_SIZE); + if (!h) + return -EINVAL; + + if (!IS_ALIGNED(addr, PMD_SIZE)) + return -EINVAL; + + vma = find_vma(mm, addr); + if (!vma || !range_in_vma(vma, addr, addr + PMD_SIZE)) + return -EINVAL; + + mapping = vma->vm_file->f_mapping; + i_mmap_lock_read(mapping); + ptep = huge_pte_alloc(mm, addr, huge_page_size(h)); + if (!ptep) { + ret = -ENXIO; + goto out_unlock; + } + + if (WARN_ON(ptep && !pte_none(*ptep) && !pmd_huge(*(pmd_t *)ptep))) { + ret = -ENXIO; + goto out_unlock; + } + + entry = pfn_pte(pfn, prot); + entry = huge_pte_mkdirty(entry); + if (!(pgprot_val(prot) & PTE_RDONLY)) + entry = huge_pte_mkwrite(entry); + entry = pte_mkyoung(entry); + entry = pte_mkhuge(entry); + if (special) + entry = pte_mkspecial(entry); + + ptl = huge_pte_lockptr(h, mm, ptep); + spin_lock(ptl); + set_huge_pte_at(mm, addr, ptep, entry); + spin_unlock(ptl); + +out_unlock: + i_mmap_unlock_read(mapping); + + return ret; +} + +int hugetlb_insert_hugepage_pte(struct mm_struct *mm, unsigned long addr, + pgprot_t prot, struct page *hpage) +{ + return __hugetlb_insert_hugepage(mm, addr, prot, page_to_pfn(hpage), false); +} +EXPORT_SYMBOL_GPL(hugetlb_insert_hugepage_pte); + +int hugetlb_insert_hugepage_pte_by_pa(struct mm_struct *mm, unsigned long addr, + pgprot_t prot, unsigned long phy_addr) +{ + return __hugetlb_insert_hugepage(mm, addr, prot, phy_addr >> PAGE_SHIFT, true); +} +EXPORT_SYMBOL_GPL(hugetlb_insert_hugepage_pte_by_pa); + +#ifdef CONFIG_ASCEND_CHARGE_MIGRATE_HUGEPAGES + +static int __init ascend_enable_charge_migrate_hugepages(char *s) +{ + enable_charge_mighp = 1; + + pr_info("Ascend enable charge migrate hugepage\n"); + + return 1; +} +__setup("enable_charge_mighp", ascend_enable_charge_migrate_hugepages); + +#endif +#endif
From: Wang Wensheng wangwensheng4@huawei.com
ascend inclusion category: Feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4NDAW CVE: NA
-------------------
Some devices cannot handle mixed levels of page table. They want to know exactly whether the memory they allocated is backed by hugepages or not. Introduce vmalloc/vmap/remap interfaces that handle only hugepages.
Introduce the VM_HUGE_PAGES flag. __vmalloc_node_range() allocates only PMD_SIZE hugepages, with no fallback to small pages, when VM_HUGE_PAGES is specified.
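A hedged sketch of the intended use: allocate a hugepage-only buffer and hand it to userspace from a hypothetical driver's ->mmap() handler:

	/*
	 * size is rounded up to PMD_SIZE; because VM_HUGE_PAGES is set
	 * internally, this fails instead of falling back to 4K pages
	 */
	void *buf = vmalloc_hugepage_user(4UL << 20);
	if (!buf)
		return -ENOMEM;

	/* in ->mmap(): VM_USERMAP was set at allocation time, so the
	 * whole vma can be backed by the hugepage mapping */
	return remap_vmalloc_hugepage_range(vma, buf, 0);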
Signed-off-by: Wang Wensheng wangwensheng4@huawei.com Signed-off-by: Tang Yizhou tangyizhou@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Reviewed-by: Weilong Chen chenweilong@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- include/linux/mm.h | 6 ++ include/linux/vmalloc.h | 11 +++ mm/vmalloc.c | 203 +++++++++++++++++++++++++++++++++++++++- 3 files changed, 218 insertions(+), 2 deletions(-)
diff --git a/include/linux/mm.h b/include/linux/mm.h index e30344cd6291..f8ae3e41d5db 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -233,6 +233,12 @@ int overcommit_policy_handler(struct ctl_table *, int, void *, size_t *, /* test whether an address (unsigned long or pointer) is aligned to PAGE_SIZE */ #define PAGE_ALIGNED(addr) IS_ALIGNED((unsigned long)(addr), PAGE_SIZE)
+/* to align the pointer to the (next) PMD hugepage boundary */ +#define PMD_ALIGN(addr) ALIGN(addr, PMD_SIZE) + +/* test whether an address (unsigned long or pointer) is aligned to PMD_SIZE */ +#define PMD_ALIGNED(addr) IS_ALIGNED((unsigned long)(addr), PMD_SIZE) + #define lru_to_page(head) (list_entry((head)->prev, struct page, lru))
/* diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index 7ddd312efce4..8f9e13944cc7 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -27,6 +27,7 @@ struct notifier_block; /* in notifier.h */ #define VM_FLUSH_RESET_PERMS 0x00000100 /* reset direct map and flush TLB on unmap, can't be freed in atomic context */ #define VM_MAP_PUT_PAGES 0x00000200 /* put pages and free array in vfree */ #define VM_NO_HUGE_VMAP 0x00000400 /* force PAGE_SIZE pte mapping */ +#define VM_HUGE_PAGES 0x00001000 /* used for vmalloc hugepages */
/* * VM_KASAN is used slighly differently depending on CONFIG_KASAN_VMALLOC. @@ -136,6 +137,8 @@ extern void *__vmalloc_node_range(unsigned long size, unsigned long align, void *__vmalloc_node(unsigned long size, unsigned long align, gfp_t gfp_mask, int node, const void *caller); void *vmalloc_no_huge(unsigned long size); +extern void *vmalloc_hugepage(unsigned long size); +extern void *vmalloc_hugepage_user(unsigned long size);
extern void vfree(const void *addr); extern void vfree_atomic(const void *addr); @@ -152,6 +155,14 @@ extern int remap_vmalloc_range_partial(struct vm_area_struct *vma, extern int remap_vmalloc_range(struct vm_area_struct *vma, void *addr, unsigned long pgoff);
+extern void *vmap_hugepage(struct page **pages, unsigned int count, + unsigned long flags, pgprot_t prot); +extern int remap_vmalloc_hugepage_range_partial(struct vm_area_struct *vma, + unsigned long uaddr, void *kaddr, + unsigned long pgoff, unsigned long size); +extern int remap_vmalloc_hugepage_range(struct vm_area_struct *vma, + void *addr, unsigned long pgoff); + /* * Architectures can set this mask to a combination of PGTBL_P?D_MODIFIED values * and let generic vmalloc and ioremap code know when arch_sync_kernel_mappings() diff --git a/mm/vmalloc.c b/mm/vmalloc.c index aa46f5028d17..755be0c19c81 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -36,6 +36,8 @@ #include <linux/overflow.h> #include <linux/pgtable.h> #include <linux/uaccess.h> +#include <linux/hugetlb.h> +#include <asm/io.h> #include <asm/tlbflush.h> #include <asm/shmparam.h>
@@ -575,6 +577,38 @@ static int vmap_pages_range(unsigned long addr, unsigned long end, return err; }
+static int vmap_hugepages_range_noflush(unsigned long addr, unsigned long end, + pgprot_t prot, struct page **pages, unsigned int page_shift) +{ + unsigned int i, nr = (end - addr) >> page_shift; + + for (i = 0; i < nr; i++) { + int err; + + err = vmap_range_noflush(addr, addr + (1UL << page_shift), + __pa(page_address(pages[i])), prot, + page_shift); + if (err) + return err; + + addr += 1UL << page_shift; + } + + return 0; +} + +static int vmap_hugepages_range(unsigned long addr, unsigned long end, + pgprot_t prot, struct page **pages, + unsigned int page_shift) +{ + int err; + + err = vmap_hugepages_range_noflush(addr, end, prot, pages, page_shift); + flush_cache_vmap(addr, end); + + return err; +} + /** * map_kernel_range_noflush - map kernel VM area with the specified pages * @addr: start of the VM area to map @@ -2749,6 +2783,45 @@ void *vmap(struct page **pages, unsigned int count, } EXPORT_SYMBOL(vmap);
+/** + * vmap_hugepage - map an array of huge pages into virtually contiguous space + * @pages: array of huge page pointers (only the header) + * @count: number of pages to map + * @flags: vm_area->flags + * @prot: page protection for the mapping + * + * Maps @count pages from @pages into contiguous kernel virtual + * space. + */ +void *vmap_hugepage(struct page **pages, unsigned int count, + unsigned long flags, pgprot_t prot) +{ + struct vm_struct *area; + unsigned long size; /* In bytes */ + + might_sleep(); + + if (count > totalram_pages()) + return NULL; + + size = (unsigned long)count << PMD_SHIFT; + area = __get_vm_area_node(size, PMD_SIZE, PMD_SHIFT, flags | VM_HUGE_PAGES, + VMALLOC_START, VMALLOC_END, + NUMA_NO_NODE, GFP_KERNEL, __builtin_return_address(0)); + if (!area) + return NULL; + + if (vmap_hugepages_range((unsigned long)area->addr, + (unsigned long)area->addr + size, prot, + pages, PMD_SHIFT) < 0) { + vunmap(area->addr); + return NULL; + } + + return area->addr; +} +EXPORT_SYMBOL(vmap_hugepage); + #ifdef CONFIG_VMAP_PFN struct vmap_pfn_data { unsigned long *pfns; @@ -2933,7 +3006,7 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align, size_per_node = size; if (node == NUMA_NO_NODE) size_per_node /= num_online_nodes(); - if (size_per_node >= PMD_SIZE) { + if (size_per_node >= PMD_SIZE || vm_flags & VM_HUGE_PAGES) { shift = PMD_SHIFT; align = max(real_align, 1UL << shift); size = ALIGN(real_size, 1UL << shift); @@ -2968,7 +3041,8 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align, return addr;
fail: - if (shift > PAGE_SHIFT) { + /* User could specify VM_HUGE_PAGES to alloc only hugepages. */ + if (shift > PAGE_SHIFT && !(vm_flags & VM_HUGE_PAGES)) { shift = PAGE_SHIFT; align = real_align; size = real_size; @@ -3177,6 +3251,44 @@ void *vmalloc_32_user(unsigned long size) } EXPORT_SYMBOL(vmalloc_32_user);
+/** + * vmalloc_hugepage - allocate virtually contiguous hugetlb memory + * @size: allocation size + * + * Allocate enough huge pages to cover @size and map them into + * contiguous kernel virtual space. + * + * The allocation size is aligned to PMD_SIZE automatically + */ +void *vmalloc_hugepage(unsigned long size) +{ + return __vmalloc_node_range(size, PMD_SIZE, VMALLOC_START, VMALLOC_END, + GFP_KERNEL, PAGE_KERNEL, + VM_HUGE_PAGES, NUMA_NO_NODE, + __builtin_return_address(0)); +} +EXPORT_SYMBOL(vmalloc_hugepage); + +/** + * vmalloc_hugepage_user - allocate virtually contiguous hugetlb memory + * for userspace + * @size: allocation size + * + * Allocate enough huge pages to cover @size and map them into + * contiguous kernel virtual space. The resulting memory area + * is zeroed so it can be mapped to userspace without leaking data. + * + * The allocation size is aligned to PMD_SIZE automatically + */ +void *vmalloc_hugepage_user(unsigned long size) +{ + return __vmalloc_node_range(size, PMD_SIZE, VMALLOC_START, VMALLOC_END, + GFP_KERNEL | __GFP_ZERO, PAGE_KERNEL, + VM_USERMAP | VM_HUGE_PAGES, NUMA_NO_NODE, + __builtin_return_address(0)); +} +EXPORT_SYMBOL(vmalloc_hugepage_user); + /* * small helper routine , copy contents to buf from addr. * If the page is not present, fill zero. @@ -3498,6 +3610,93 @@ int remap_vmalloc_range(struct vm_area_struct *vma, void *addr, } EXPORT_SYMBOL(remap_vmalloc_range);
+/** + * remap_vmalloc_hugepage_range_partial - map vmalloc hugepages + * to userspace + * @vma: vma to cover + * @uaddr: target user address to start at + * @kaddr: virtual address of vmalloc hugepage kernel memory + * @size: size of map area + * + * Returns: 0 for success, -Exxx on failure + * + * This function checks that @kaddr is a valid vmalloc'ed area, + * and that it is big enough to cover the range starting at + * @uaddr in @vma. Will return failure if that criteria isn't + * met. + * + * Similar to remap_pfn_range() (see mm/memory.c) + */ +int remap_vmalloc_hugepage_range_partial(struct vm_area_struct *vma, unsigned long uaddr, + void *kaddr, unsigned long pgoff, unsigned long size) +{ + struct vm_struct *area; + unsigned long off; + unsigned long end_index; + + if (check_shl_overflow(pgoff, PMD_SHIFT, &off)) + return -EINVAL; + + size = PMD_ALIGN(size); + + if (!PMD_ALIGNED(uaddr) || !PMD_ALIGNED(kaddr)) + return -EINVAL; + + area = find_vm_area(kaddr); + if (!area) + return -EINVAL; + + if (!(area->flags & VM_USERMAP)) + return -EINVAL; + + if (check_add_overflow(size, off, &end_index) || + end_index > get_vm_area_size(area)) + return -EINVAL; + kaddr += off; + + do { + struct page *page = vmalloc_to_page(kaddr); + int ret; + + ret = hugetlb_insert_hugepage_pte_by_pa(vma->vm_mm, uaddr, + vma->vm_page_prot, page_to_phys(page)); + if (ret) + return ret; + + uaddr += PMD_SIZE; + kaddr += PMD_SIZE; + size -= PMD_SIZE; + } while (size > 0); + + vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; + + return 0; +} +EXPORT_SYMBOL(remap_vmalloc_hugepage_range_partial); + +/** + * remap_vmalloc_hugepage_range - map vmalloc hugepages to userspace + * @vma: vma to cover (map full range of vma) + * @addr: vmalloc memory + * @pgoff: number of hugepages into addr before first page to map + * + * Returns: 0 for success, -Exxx on failure + * + * This function checks that addr is a valid vmalloc'ed area, and + * that it is big enough to cover the vma. Will return failure if + * that criteria isn't met. + * + * Similar to remap_pfn_range() (see mm/memory.c) + */ +int remap_vmalloc_hugepage_range(struct vm_area_struct *vma, void *addr, + unsigned long pgoff) +{ + return remap_vmalloc_hugepage_range_partial(vma, vma->vm_start, + addr, pgoff, + vma->vm_end - vma->vm_start); +} +EXPORT_SYMBOL(remap_vmalloc_hugepage_range); + void free_vm_area(struct vm_struct *area) { struct vm_struct *ret;
From: Wang Wensheng wangwensheng4@huawei.com
ascend inclusion category: Feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4NDAW CVE: NA
-------------------
do_mmap()/mmap_region()/__mm_populate() can only operate on the current process. The share pool needs to handle other processes and create memory mappings for them, so export new functions that take an explicit mm to select the target process. This does not change the current logic and only takes effect for the share pool.
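A hedged sketch of how the later share pool code might drive the new entry points against another process's mm; the locking, flags and error handling here are assumptions:

	unsigned long populate = 0;
	unsigned long addr;

	mmap_write_lock(mm);		/* mm of the target process */
	addr = __do_mmap_mm(mm, file, va, size, PROT_READ | PROT_WRITE,
			    MAP_SHARED | MAP_POPULATE, 0 /* vm_flags */,
			    pgoff, &populate, NULL);
	mmap_write_unlock(mm);

	if (!IS_ERR_VALUE(addr) && populate)
		do_mm_populate(mm, addr, populate, 0 /* ignore_errors */);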
Signed-off-by: Tang Yizhou tangyizhou@huawei.com Signed-off-by: Zhou Guanghui zhouguanghui1@huawei.com Signed-off-by: Wang Wensheng wangwensheng4@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Reviewed-by: Weilong Chen chenweilong@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- include/linux/mm.h | 11 +++++++++++ mm/gup.c | 24 +++++++++++++++++------- mm/mmap.c | 41 ++++++++++++++++++++++++++++++----------- 3 files changed, 58 insertions(+), 18 deletions(-)
diff --git a/include/linux/mm.h b/include/linux/mm.h index f8ae3e41d5db..a4996e11cda6 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2611,6 +2611,10 @@ extern int do_munmap(struct mm_struct *, unsigned long, size_t, struct list_head *uf); extern int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int behavior);
+extern unsigned long __do_mmap_mm(struct mm_struct *mm, struct file *file, + unsigned long addr, unsigned long len, unsigned long prot, + unsigned long flags, vm_flags_t vm_flags, unsigned long pgoff, + unsigned long *populate, struct list_head *uf); #ifdef CONFIG_MMU extern int __mm_populate(unsigned long addr, unsigned long len, int ignore_errors); @@ -2619,8 +2623,15 @@ static inline void mm_populate(unsigned long addr, unsigned long len) /* Ignore errors */ (void) __mm_populate(addr, len, 1); } +extern int do_mm_populate(struct mm_struct *mm, unsigned long addr, unsigned long len, + int ignore_errors); #else static inline void mm_populate(unsigned long addr, unsigned long len) {} +static inline int do_mm_populate(struct mm_struct *mm, unsigned long addr, unsigned long len, + int ignore_errors) +{ + return -EPERM; +} #endif
/* These take the mm semaphore themselves */ diff --git a/mm/gup.c b/mm/gup.c index ab544708191e..ee9c2c39c299 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -1507,15 +1507,12 @@ long populate_vma_page_range(struct vm_area_struct *vma, }
/* - * __mm_populate - populate and/or mlock pages within a range of address space. - * - * This is used to implement mlock() and the MAP_POPULATE / MAP_LOCKED mmap - * flags. VMAs must be already marked with the desired vm_flags, and - * mmap_lock must not be held. + * do_mm_populate - populate and/or mlock pages within a range of + * address space for the specified mm_struct. */ -int __mm_populate(unsigned long start, unsigned long len, int ignore_errors) +int do_mm_populate(struct mm_struct *mm, unsigned long start, unsigned long len, + int ignore_errors) { - struct mm_struct *mm = current->mm; unsigned long end, nstart, nend; struct vm_area_struct *vma = NULL; int locked = 0; @@ -1565,6 +1562,19 @@ int __mm_populate(unsigned long start, unsigned long len, int ignore_errors) mmap_read_unlock(mm); return ret; /* 0 or negative error code */ } + +/* + * __mm_populate - populate and/or mlock pages within a range of address space. + * + * This is used to implement mlock() and the MAP_POPULATE / MAP_LOCKED mmap + * flags. VMAs must be already marked with the desired vm_flags, and + * mmap_lock must not be held. + */ +int __mm_populate(unsigned long start, unsigned long len, int ignore_errors) +{ + return do_mm_populate(current->mm, start, len, ignore_errors); +} + #else /* CONFIG_MMU */ static long __get_user_pages_locked(struct mm_struct *mm, unsigned long start, unsigned long nr_pages, struct page **pages, diff --git a/mm/mmap.c b/mm/mmap.c index 3991634121d7..0af1300734a2 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1399,12 +1399,17 @@ static inline bool file_mmap_ok(struct file *file, struct inode *inode, return true; }
-static inline unsigned long -__do_mmap(struct file *file, unsigned long addr, unsigned long len, - unsigned long prot, unsigned long flags, vm_flags_t vm_flags, - unsigned long pgoff, unsigned long *populate, struct list_head *uf) +static unsigned long __mmap_region(struct mm_struct *mm, + struct file *file, unsigned long addr, + unsigned long len, vm_flags_t vm_flags, + unsigned long pgoff, struct list_head *uf); + +unsigned long __do_mmap_mm(struct mm_struct *mm, struct file *file, + unsigned long addr, unsigned long len, + unsigned long prot, unsigned long flags, + vm_flags_t vm_flags, unsigned long pgoff, + unsigned long *populate, struct list_head *uf) { - struct mm_struct *mm = current->mm; int pkey = 0;
*populate = 0; @@ -1587,14 +1592,22 @@ __do_mmap(struct file *file, unsigned long addr, unsigned long len, if (flags & MAP_CHECKNODE) set_vm_checknode(&vm_flags, flags);
- addr = mmap_region(file, addr, len, vm_flags, pgoff, uf); + addr = __mmap_region(mm, file, addr, len, vm_flags, pgoff, uf); if (!IS_ERR_VALUE(addr) && ((vm_flags & VM_LOCKED) || (flags & (MAP_POPULATE | MAP_NONBLOCK)) == MAP_POPULATE)) *populate = len; return addr; } +EXPORT_SYMBOL(__do_mmap_mm);
+static inline unsigned long +__do_mmap(struct file *file, unsigned long addr, unsigned long len, + unsigned long prot, unsigned long flags, vm_flags_t vm_flags, + unsigned long pgoff, unsigned long *populate, struct list_head *uf) +{ + return __do_mmap_mm(current->mm, file, addr, len, prot, flags, vm_flags, pgoff, populate, uf); +} #ifdef CONFIG_USERSWAP /* * Check if pages between 'addr ~ addr+len' can be user swapped. If so, get @@ -1955,11 +1968,11 @@ static inline int accountable_mapping(struct file *file, vm_flags_t vm_flags) return (vm_flags & (VM_NORESERVE | VM_SHARED | VM_WRITE)) == VM_WRITE; }
-unsigned long mmap_region(struct file *file, unsigned long addr, - unsigned long len, vm_flags_t vm_flags, unsigned long pgoff, - struct list_head *uf) +static unsigned long __mmap_region(struct mm_struct *mm, struct file *file, + unsigned long addr, unsigned long len, + vm_flags_t vm_flags, unsigned long pgoff, + struct list_head *uf) { - struct mm_struct *mm = current->mm; struct vm_area_struct *vma, *prev, *merge; int error; struct rb_node **rb_link, *rb_parent; @@ -2148,6 +2161,13 @@ unsigned long mmap_region(struct file *file, unsigned long addr, return error; }
+unsigned long mmap_region(struct file *file, unsigned long addr, + unsigned long len, vm_flags_t vm_flags, unsigned long pgoff, + struct list_head *uf) +{ + return __mmap_region(current->mm, file, addr, len, vm_flags, pgoff, uf); +} + static unsigned long unmapped_area(struct vm_unmapped_area_info *info) { /* @@ -3209,7 +3229,6 @@ SYSCALL_DEFINE2(munmap, unsigned long, addr, size_t, len) return __vm_munmap(addr, len, true); }
- /* * Emulation of deprecated remap_file_pages() syscall. */
From: Wang Wensheng wangwensheng4@huawei.com
ascend inclusion category: Feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4NDAW CVE: NA
-------------------
In some scenarios there are memory-only NUMA nodes dedicated to designated process groups, and users need to specify the memory node when requesting shared memory.
Here we extend shmem_alloc_and_acct_page() to accept an extra node_id, which will be supplied through the share_pool interface in later patches.
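In other words, existing callers keep today's behaviour by passing numa_node_id(), while a later share pool caller can pin the allocation to a designated node (sp_node_id below is a hypothetical value supplied by that interface):

	/* current callers: allocate on the local node, as before */
	page = shmem_alloc_and_acct_page(gfp, inode, index, false,
					 numa_node_id());

	/* later share pool caller: allocate on a designated node */
	page = shmem_alloc_and_acct_page(gfp, inode, index, false,
					 sp_node_id);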
Signed-off-by: Peng Wu wupeng58@huawei.com Signed-off-by: Wang Wensheng wangwensheng4@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Reviewed-by: Weilong Chen chenweilong@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- mm/shmem.c | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-)
diff --git a/mm/shmem.c b/mm/shmem.c index b488b6373454..51f8f3b75803 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1532,7 +1532,7 @@ static struct page *shmem_swapin(swp_entry_t swap, gfp_t gfp, }
static struct page *shmem_alloc_hugepage(gfp_t gfp, - struct shmem_inode_info *info, pgoff_t index) + struct shmem_inode_info *info, pgoff_t index, int node_id) { struct vm_area_struct pvma; struct address_space *mapping = info->vfs_inode.i_mapping; @@ -1546,7 +1546,7 @@ static struct page *shmem_alloc_hugepage(gfp_t gfp,
shmem_pseudo_vma_init(&pvma, info, hindex); page = alloc_pages_vma(gfp | __GFP_COMP | __GFP_NORETRY | __GFP_NOWARN, - HPAGE_PMD_ORDER, &pvma, 0, numa_node_id(), true); + HPAGE_PMD_ORDER, &pvma, 0, node_id, true); shmem_pseudo_vma_destroy(&pvma); if (page) prep_transhuge_page(page); @@ -1556,13 +1556,14 @@ static struct page *shmem_alloc_hugepage(gfp_t gfp, }
static struct page *shmem_alloc_page(gfp_t gfp, - struct shmem_inode_info *info, pgoff_t index) + struct shmem_inode_info *info, pgoff_t index, + int node_id) { struct vm_area_struct pvma; struct page *page;
shmem_pseudo_vma_init(&pvma, info, index); - page = alloc_page_vma(gfp, &pvma, 0); + page = alloc_pages_vma(gfp, 0, &pvma, 0, node_id, false); shmem_pseudo_vma_destroy(&pvma);
return page; @@ -1570,7 +1571,7 @@ static struct page *shmem_alloc_page(gfp_t gfp,
static struct page *shmem_alloc_and_acct_page(gfp_t gfp, struct inode *inode, - pgoff_t index, bool huge) + pgoff_t index, bool huge, int node_id) { struct shmem_inode_info *info = SHMEM_I(inode); struct page *page; @@ -1585,9 +1586,9 @@ static struct page *shmem_alloc_and_acct_page(gfp_t gfp, goto failed;
if (huge) - page = shmem_alloc_hugepage(gfp, info, index); + page = shmem_alloc_hugepage(gfp, info, index, node_id); else - page = shmem_alloc_page(gfp, info, index); + page = shmem_alloc_page(gfp, info, index, node_id); if (page) { __SetPageLocked(page); __SetPageSwapBacked(page); @@ -1636,7 +1637,7 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp, * limit chance of success by further cpuset and node constraints. */ gfp &= ~GFP_CONSTRAINT_MASK; - newpage = shmem_alloc_page(gfp, info, index); + newpage = shmem_alloc_page(gfp, info, index, numa_node_id()); if (!newpage) return -ENOMEM;
@@ -1888,11 +1889,11 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index, }
alloc_huge: - page = shmem_alloc_and_acct_page(gfp, inode, index, true); + page = shmem_alloc_and_acct_page(gfp, inode, index, true, numa_node_id()); if (IS_ERR(page)) { alloc_nohuge: page = shmem_alloc_and_acct_page(gfp, inode, - index, false); + index, false, numa_node_id()); } if (IS_ERR(page)) { int retry = 5; @@ -2379,7 +2380,7 @@ static int shmem_mfill_atomic_pte(struct mm_struct *dst_mm, }
if (!*pagep) { - page = shmem_alloc_page(gfp, info, pgoff); + page = shmem_alloc_page(gfp, info, pgoff, numa_node_id()); if (!page) goto out_unacct_blocks;
From: Wang Wensheng wangwensheng4@huawei.com
ascend inclusion category: Feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4NDAW CVE: NA
-------------------
The share pool is a big feature, mainly used to share user virtual memory among different processes in the same group. It is used in the following steps:
1. Process A creates a new group, which is owned by process A.
2. Process A adds process B to the group.
3. Process A adds process C to the same group.
4. Process B allocates new memory at a VA and writes something to it.
5. The VA is sent to process C by IPC, so process C gets it.
6. Process C accesses the VA and reads the data directly.
7. Process A can add more processes to the group to share the memory.
8. The memory is released with the free function or by exiting the group.
The new feature is enabled only when both CONFIG_ASCEND_SHARE_POOL and the enable_ascend_share_pool bootarg are set; it does not affect anything when disabled.
Signed-off-by: Wang Wensheng wangwensheng4@huawei.com Signed-off-by: Tang Yizhou tangyizhou@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Reviewed-by: Weilong Chen chenweilong@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- arch/arm64/Kconfig | 9 ++ include/linux/mm_types.h | 4 + include/linux/share_pool.h | 226 +++++++++++++++++++++++++++++++++++++ kernel/fork.c | 4 + mm/Makefile | 1 + mm/share_pool.c | 45 ++++++++ 6 files changed, 289 insertions(+) create mode 100644 include/linux/share_pool.h create mode 100644 mm/share_pool.c
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 85610dd3ac0c..405e5ce460ce 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -2034,6 +2034,15 @@ config ASCEND_CHARGE_MIGRATE_HUGEPAGES This option enable the feature to charge migrate hugepages to memory cgroup.
+config ASCEND_SHARE_POOL + bool "Enable support for the Share Pool Memory" + default n + depends on HAVE_ARCH_HUGE_VMALLOC + select ARCH_USES_HIGH_VMA_FLAGS + help + This feature allows multiple processes to share virtual memory both + in kernel and user level, which is only enabled for ascend platform. + endif
endmenu diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 2729eb58aca4..30b36a3adb87 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -585,6 +585,10 @@ struct mm_struct { struct kvm *kvm; #endif
+#if IS_ENABLED(CONFIG_ASCEND_SHARE_POOL) + struct sp_group_master *sp_group_master; +#endif + /* * The mm_cpumask needs to be at the end of mm_struct, because it * is dynamically sized based on nr_cpu_ids. diff --git a/include/linux/share_pool.h b/include/linux/share_pool.h new file mode 100644 index 000000000000..84d9c539f12b --- /dev/null +++ b/include/linux/share_pool.h @@ -0,0 +1,226 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef LINUX_SHARE_POOL_H +#define LINUX_SHARE_POOL_H + +#include <linux/mman.h> +#include <linux/mm_types.h> +#include <linux/notifier.h> +#include <linux/vmalloc.h> +#include <linux/printk.h> +#include <linux/hashtable.h> +#include <linux/numa.h> +#include <linux/jump_label.h> + +#define SP_HUGEPAGE (1 << 0) +#define SP_HUGEPAGE_ONLY (1 << 1) +#define SP_DVPP (1 << 2) +#define SP_SPEC_NODE_ID (1 << 3) + +#define DEVICE_ID_BITS 4UL +#define DEVICE_ID_MASK ((1UL << DEVICE_ID_BITS) - 1UL) +#define DEVICE_ID_SHIFT 32UL +#define NODE_ID_BITS NODES_SHIFT +#define NODE_ID_MASK ((1UL << NODE_ID_BITS) - 1UL) +#define NODE_ID_SHIFT (DEVICE_ID_SHIFT + DEVICE_ID_BITS) + +#define SP_FLAG_MASK (SP_HUGEPAGE | SP_HUGEPAGE_ONLY | SP_DVPP | \ + SP_SPEC_NODE_ID | \ + (DEVICE_ID_MASK << DEVICE_ID_SHIFT) | \ + (NODE_ID_MASK << NODE_ID_SHIFT)) + +#define sp_flags_device_id(flags) (((flags) >> DEVICE_ID_SHIFT) & DEVICE_ID_MASK) +#define sp_flags_node_id(flags) (((flags) >> NODE_ID_SHIFT) & NODE_ID_MASK) + +#define SPG_ID_NONE (-1) /* not associated with sp_group, only for specified thread */ +#define SPG_ID_DEFAULT 0 /* use the spg id of current thread */ +#define SPG_ID_MIN 1 /* valid id should be >= 1 */ +#define SPG_ID_MAX 99999 +#define SPG_ID_AUTO_MIN 100000 +#define SPG_ID_AUTO_MAX 199999 +#define SPG_ID_AUTO 200000 /* generate group id automatically */ + +#define MAX_DEVID 8 /* the max num of Da-vinci devices */ + +extern int sysctl_share_pool_hugepage_enable; + +extern int sysctl_ac_mode; + +extern int sysctl_sp_debug_mode; + +extern struct static_key_false share_pool_enabled_key; + +extern int sysctl_share_pool_map_lock_enable; + +extern int sysctl_sp_compact_enable; +extern unsigned long sysctl_sp_compact_interval; +extern unsigned long sysctl_sp_compact_interval_max; +extern int sysctl_sp_perf_alloc; + +extern int sysctl_sp_perf_k2u; + +/* we estimate an sp-group ususally contains at most 64 sp-group */ +#define SP_SPG_HASH_BITS 6 + +struct sp_spg_stat { + int spg_id; + /* record the number of hugepage allocation failures */ + atomic_t hugepage_failures; + /* number of sp_area */ + atomic_t spa_num; + /* total size of all sp_area from sp_alloc and k2u */ + atomic64_t size; + /* total size of all sp_area from sp_alloc 0-order page */ + atomic64_t alloc_nsize; + /* total size of all sp_area from sp_alloc hugepage */ + atomic64_t alloc_hsize; + /* total size of all sp_area from ap_alloc */ + atomic64_t alloc_size; + /* total size of all sp_area from sp_k2u */ + atomic64_t k2u_size; + struct mutex lock; /* protect hashtable */ + DECLARE_HASHTABLE(hash, SP_SPG_HASH_BITS); +}; + +/* we estimate a process ususally belongs to at most 16 sp-group */ +#define SP_PROC_HASH_BITS 4 + +/* per process memory usage statistics indexed by tgid */ +struct sp_proc_stat { + atomic_t use_count; + int tgid; + struct mm_struct *mm; + struct mutex lock; /* protect hashtable */ + DECLARE_HASHTABLE(hash, SP_PROC_HASH_BITS); + char comm[TASK_COMM_LEN]; + /* + * alloc amount minus free amount, may be negative when freed by + * another task in the same sp group. 
+ */ + atomic64_t alloc_size; + atomic64_t k2u_size; +}; + +/* Processes in the same sp_group can share memory. + * Memory layout for share pool: + * + * |-------------------- 8T -------------------|---|------ 8T ------------| + * | Device 0 | Device 1 |...| | + * |----------------------------------------------------------------------| + * |------------- 16G -------------| 16G | | | + * | DVPP GROUP0 | DVPP GROUP1 | ... | ... |...| sp normal memory | + * | sp | sp | | | | | + * |----------------------------------------------------------------------| + * + * The host SVM feature reserves 8T virtual memory by mmap, and due to the + * restriction of DVPP, while SVM and share pool will both allocate memory + * for DVPP, the memory have to be in the same 32G range. + * + * Share pool reserves 16T memory, with 8T for normal uses and 8T for DVPP. + * Within this 8T DVPP memory, SVM will call sp_config_dvpp_range() to + * tell us which 16G memory range is reserved for share pool . + * + * In some scenarios where there is no host SVM feature, share pool uses + * the default 8G memory setting for DVPP. + */ +struct sp_group { + int id; + struct file *file; + struct file *file_hugetlb; + /* number of process in this group */ + int proc_num; + /* list head of processes (sp_group_node, each represents a process) */ + struct list_head procs; + /* list head of sp_area. it is protected by spin_lock sp_area_lock */ + struct list_head spa_list; + /* group statistics */ + struct sp_spg_stat *stat; + /* we define the creator process of a sp_group as owner */ + struct task_struct *owner; + /* is_alive == false means it's being destroyed */ + bool is_alive; + atomic_t use_count; + /* protect the group internal elements, except spa_list */ + struct rw_semaphore rw_lock; +}; + +/* a per-process(per mm) struct which manages a sp_group_node list */ +struct sp_group_master { + /* + * number of sp groups the process belongs to, + * a.k.a the number of sp_node in node_list + */ + unsigned int count; + /* list head of sp_node */ + struct list_head node_list; + struct mm_struct *mm; + struct sp_proc_stat *stat; +}; + +/* + * each instance represents an sp group the process belongs to + * sp_group_master : sp_group_node = 1 : N + * sp_group_node->spg : sp_group = 1 : 1 + * sp_group_node : sp_group->procs = N : 1 + */ +struct sp_group_node { + /* list node in sp_group->procs */ + struct list_head proc_node; + /* list node in sp_group_maseter->node_list */ + struct list_head group_node; + struct sp_group_master *master; + struct sp_group *spg; + unsigned long prot; +}; + +struct sp_walk_data { + struct page **pages; + unsigned int page_count; + unsigned long uva_aligned; + unsigned long page_size; + bool is_hugepage; + pmd_t *pmd; +}; + +#define MAP_SHARE_POOL 0x200000 + +#define MMAP_TOP_4G_SIZE 0x100000000UL + +/* 8T size */ +#define MMAP_SHARE_POOL_NORMAL_SIZE 0x80000000000UL +/* 8T size*/ +#define MMAP_SHARE_POOL_DVPP_SIZE 0x80000000000UL +/* 16G size */ +#define MMAP_SHARE_POOL_16G_SIZE 0x400000000UL +#define MMAP_SHARE_POOL_SIZE (MMAP_SHARE_POOL_NORMAL_SIZE + MMAP_SHARE_POOL_DVPP_SIZE) +/* align to 2M hugepage size, and MMAP_SHARE_POOL_TOP_16G_START should be align to 16G */ +#define MMAP_SHARE_POOL_END ((TASK_SIZE - MMAP_SHARE_POOL_DVPP_SIZE) & ~((1 << 21) - 1)) +#define MMAP_SHARE_POOL_START (MMAP_SHARE_POOL_END - MMAP_SHARE_POOL_SIZE) +#define MMAP_SHARE_POOL_16G_START (MMAP_SHARE_POOL_END - MMAP_SHARE_POOL_DVPP_SIZE) + +#ifdef CONFIG_ASCEND_SHARE_POOL + +static inline void sp_init_mm(struct mm_struct *mm) +{ + 
mm->sp_group_master = NULL; +} + +extern int mg_sp_group_add_task(int pid, unsigned long prot, int spg_id); +extern int sp_group_add_task(int pid, int spg_id); + +static inline bool sp_is_enabled(void) +{ + return static_branch_likely(&share_pool_enabled_key); +} + +#else /* CONFIG_ASCEND_SHARE_POOL */ + +static inline void sp_init_mm(struct mm_struct *mm) { } + +static inline bool sp_is_enabled(void) +{ + return false; +} + +#endif /* !CONFIG_ASCEND_SHARE_POOL */ + +#endif /* LINUX_SHARE_POOL_H */ diff --git a/kernel/fork.c b/kernel/fork.c index d730b57c9c22..454b42af1de8 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -96,6 +96,7 @@ #include <linux/kasan.h> #include <linux/scs.h> #include <linux/io_uring.h> +#include <linux/share_pool.h>
#include <asm/pgalloc.h> #include <linux/uaccess.h> @@ -1055,6 +1056,9 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, goto fail_nocontext;
mm->user_ns = get_user_ns(user_ns); + + sp_init_mm(mm); + return mm;
fail_nocontext: diff --git a/mm/Makefile b/mm/Makefile index c14522bd17ed..ec3d0ab14a6a 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -126,3 +126,4 @@ obj-$(CONFIG_PAGE_REPORTING) += page_reporting.o obj-$(CONFIG_HAVE_BOOTMEM_INFO_NODE) += bootmem_info.o obj-$(CONFIG_PIN_MEMORY) += pin_mem.o obj-$(CONFIG_SHRINK_PAGECACHE) += page_cache_limit.o +obj-$(CONFIG_ASCEND_SHARE_POOL) += share_pool.o diff --git a/mm/share_pool.c b/mm/share_pool.c new file mode 100644 index 000000000000..32d473bada3a --- /dev/null +++ b/mm/share_pool.c @@ -0,0 +1,45 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Huawei Ascend Share Pool Memory + * + * Copyright (C) 2020 Huawei Limited + * Author: Tang Yizhou tangyizhou@huawei.com + * Zefan Li lizefan@huawei.com + * Wu Peng wupeng58@huawei.com + * Ding Tianhong dingtgianhong@huawei.com + * Zhou Guanghui zhouguanghui1@huawei.com + * Li Ming limingming.li@huawei.com + * + * This code is based on the hisilicon ascend platform. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#define pr_fmt(fmt) "share pool: " fmt + +#include <linux/share_pool.h> + +int mg_sp_group_add_task(int pid, unsigned long prot, int spg_id) +{ + return 0; +} +EXPORT_SYMBOL_GPL(mg_sp_group_add_task); + +int sp_group_add_task(int pid, int spg_id) +{ + return 0; +} +EXPORT_SYMBOL_GPL(sp_group_add_task); + +DEFINE_STATIC_KEY_FALSE(share_pool_enabled_key); + +static int __init enable_share_pool(char *s) +{ + static_branch_enable(&share_pool_enabled_key); + pr_info("Ascend enable share pool features via bootargs\n"); + + return 1; +} +__setup("enable_ascend_share_pool", enable_share_pool);
From: Wang Wensheng wangwensheng4@huawei.com
ascend inclusion category: Feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4NDAW CVE: NA
-------------------
Add empty implementation for all the exported symbols and we will implement those functions one by one in later patches.
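As a hedged illustration of how these symbols are meant to be used once implemented (pids, sizes and the return-value conventions are assumptions at this stage, since every function is still an empty stub):

	int spg_id, ret;
	void *va;

	/* build the group: SPG_ID_AUTO generates a free group id */
	spg_id = mg_sp_group_add_task(pid_a, PROT_READ | PROT_WRITE,
				      SPG_ID_AUTO);
	ret = mg_sp_group_add_task(pid_b, PROT_READ | PROT_WRITE, spg_id);

	/* allocate 2M of memory visible to every group member */
	va = sp_alloc(2UL << 20, SP_HUGEPAGE, spg_id);

	/* ... pass va to the other processes by IPC and use it ... */

	ret = sp_free((unsigned long)va);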
Signed-off-by: Wang Wensheng wangwensheng4@huawei.com Signed-off-by: Tang Yizhou tangyizhou@huawei.com Signed-off-by: Peng Wu wupeng58@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Reviewed-by: Weilong Chen chenweilong@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- include/linux/share_pool.h | 257 ++++++++++++++++++++++- mm/share_pool.c | 411 +++++++++++++++++++++++++++++++++++++ 2 files changed, 667 insertions(+), 1 deletion(-)
diff --git a/include/linux/share_pool.h b/include/linux/share_pool.h index 84d9c539f12b..37a26487a7d8 100644 --- a/include/linux/share_pool.h +++ b/include/linux/share_pool.h @@ -204,6 +204,56 @@ static inline void sp_init_mm(struct mm_struct *mm) mm->sp_group_master = NULL; }
+/* + * Those interfaces are exported for modules + */ +extern int mg_sp_group_add_task(int pid, unsigned long prot, int spg_id); +extern int sp_group_add_task(int pid, int spg_id); + +extern int mg_sp_group_del_task(int pid, int spg_id); +extern int sp_group_del_task(int pid, int spg_id); + +extern int mg_sp_group_id_by_pid(int pid, int *spg_ids, int *num); +extern int sp_group_id_by_pid(int pid); + +extern int sp_group_walk(int spg_id, void *data, int (*func)(struct mm_struct *mm, void *)); +extern int proc_sp_group_state(struct seq_file *m, struct pid_namespace *ns, + struct pid *pid, struct task_struct *task); + +extern void *sp_alloc(unsigned long size, unsigned long sp_flags, int spg_id); +extern void *mg_sp_alloc(unsigned long size, unsigned long sp_flags, int spg_id); + +extern int sp_free(unsigned long addr); +extern int mg_sp_free(unsigned long addr); + +extern void *sp_make_share_k2u(unsigned long kva, unsigned long size, + unsigned long sp_flags, int pid, int spg_id); +extern void *mg_sp_make_share_k2u(unsigned long kva, unsigned long size, + unsigned long sp_flags, int pid, int spg_id); + +extern void *sp_make_share_u2k(unsigned long uva, unsigned long size, int pid); +extern void *mg_sp_make_share_u2k(unsigned long uva, unsigned long size, int pid); + +extern int sp_unshare(unsigned long va, unsigned long size, int pid, int spg_id); +extern int mg_sp_unshare(unsigned long va, unsigned long size); + +extern int sp_walk_page_range(unsigned long uva, unsigned long size, + struct task_struct *tsk, struct sp_walk_data *sp_walk_data); +extern int mg_sp_walk_page_range(unsigned long uva, unsigned long size, + struct task_struct *tsk, struct sp_walk_data *sp_walk_data); + +extern void sp_walk_page_free(struct sp_walk_data *sp_walk_data); +extern void mg_sp_walk_page_free(struct sp_walk_data *sp_walk_data); + +extern int sp_register_notifier(struct notifier_block *nb); +extern int sp_unregister_notifier(struct notifier_block *nb); + +extern bool sp_config_dvpp_range(size_t start, size_t size, int device_id, int pid); +extern bool mg_sp_config_dvpp_range(size_t start, size_t size, int device_id, int pid); + +extern bool is_sharepool_addr(unsigned long addr); +extern bool mg_is_sharepool_addr(unsigned long addr); + extern int mg_sp_group_add_task(int pid, unsigned long prot, int spg_id); extern int sp_group_add_task(int pid, int spg_id);
@@ -214,13 +264,218 @@ static inline bool sp_is_enabled(void)
#else /* CONFIG_ASCEND_SHARE_POOL */
-static inline void sp_init_mm(struct mm_struct *mm) { } +static inline int mg_sp_group_add_task(int pid, unsigned long prot, int spg_id) +{ + return -EPERM; +} + +static inline int sp_group_add_task(int pid, int spg_id) +{ + return -EPERM; +} + +static inline int mg_sp_group_del_task(int pid, int spg_id) +{ + return -EPERM; +} + +static inline int sp_group_del_task(int pid, int spg_id) +{ + return -EPERM; +} + +static inline int sp_group_exit(struct mm_struct *mm) +{ + return 0; +} + +static inline void sp_group_post_exit(struct mm_struct *mm) +{ +} + +static inline int mg_sp_group_id_by_pid(int pid, int *spg_ids, int *num) +{ + return -EPERM; +} + +static inline int sp_group_id_by_pid(int pid) +{ + return -EPERM; +} + +static inline int proc_sp_group_state(struct seq_file *m, struct pid_namespace *ns, + struct pid *pid, struct task_struct *task) +{ + return -EPERM; +} + +static inline void *sp_alloc(unsigned long size, unsigned long sp_flags, int sp_id) +{ + return NULL; +} + +static inline void *mg_sp_alloc(unsigned long size, unsigned long sp_flags, int spg_id) +{ + return NULL; +} + +static inline int sp_free(unsigned long addr) +{ + return -EPERM; +} + +static inline int mg_sp_free(unsigned long addr) +{ + return -EPERM; +} + +static inline void *sp_make_share_k2u(unsigned long kva, unsigned long size, + unsigned long sp_flags, int pid, int spg_id) +{ + return NULL; +} + +static inline void *mg_sp_make_share_k2u(unsigned long kva, unsigned long size, + unsigned long sp_flags, int pid, int spg_id) +{ + return NULL; +} + +static inline void *sp_make_share_u2k(unsigned long uva, unsigned long size, int pid) +{ + return NULL; +} + +static inline void *mg_sp_make_share_u2k(unsigned long uva, unsigned long size, int pid) +{ + return NULL; +} + +static inline int sp_unshare(unsigned long va, unsigned long size, int pid, int spg_id) +{ + return -EPERM; +} + +static inline int mg_sp_unshare(unsigned long va, unsigned long size) +{ + return -EPERM; +} + + +static inline void sp_init_mm(struct mm_struct *mm) +{ +} + +static inline void sp_area_drop(struct vm_area_struct *vma) +{ +} + +static inline int sp_walk_page_range(unsigned long uva, unsigned long size, + struct task_struct *tsk, struct sp_walk_data *sp_walk_data) +{ + return 0; +} + +static inline int mg_sp_walk_page_range(unsigned long uva, unsigned long size, + struct task_struct *tsk, struct sp_walk_data *sp_walk_data) +{ + return 0; +} + +static inline void sp_walk_page_free(struct sp_walk_data *sp_walk_data) +{ +} + +static inline void mg_sp_walk_page_free(struct sp_walk_data *sp_walk_data) +{ +} + +static inline int sp_register_notifier(struct notifier_block *nb) +{ + return -EPERM; +} + +static inline int sp_unregister_notifier(struct notifier_block *nb) +{ + return -EPERM; +} + +static inline bool sp_config_dvpp_range(size_t start, size_t size, int device_id, int pid) +{ + return false; +} + +static inline bool mg_sp_config_dvpp_range(size_t start, size_t size, int device_id, int pid) +{ + return false; +} + +static inline bool is_sharepool_addr(unsigned long addr) +{ + return false; +} + +static inline bool mg_is_sharepool_addr(unsigned long addr) +{ + return false; +} + +static inline struct sp_proc_stat *sp_get_proc_stat_ref(struct mm_struct *mm) +{ + return NULL; +} + +static inline void sp_proc_stat_drop(struct sp_proc_stat *stat) +{ +} + +static inline void spa_overview_show(struct seq_file *seq) +{ +} + +static inline void spg_overview_show(struct seq_file *seq) +{ +}
static inline bool sp_is_enabled(void) { return false; }
+static inline void sp_area_work_around(struct vm_unmapped_area_info *info) +{ +} + +static inline bool sp_check_vm_share_pool(unsigned long vm_flags) +{ + return false; +} + +static inline bool is_vm_huge_special(struct vm_area_struct *vma) +{ + return false; +} + +static inline bool is_vmalloc_sharepool(unsigned long vm_flags) +{ + return false; +} + +static inline int sp_node_id(struct vm_area_struct *vma) +{ + return numa_node_id(); +} + +static inline bool sp_check_addr(unsigned long addr) +{ + return false; +} + +static inline bool sp_check_mmap_addr(unsigned long addr, unsigned long flags) +{ + return false; +} + #endif /* !CONFIG_ASCEND_SHARE_POOL */
#endif /* LINUX_SHARE_POOL_H */ diff --git a/mm/share_pool.c b/mm/share_pool.c index 32d473bada3a..67b1aad7d393 100644 --- a/mm/share_pool.c +++ b/mm/share_pool.c @@ -21,6 +21,110 @@
#include <linux/share_pool.h>
+/* access control mode macros */ +#define AC_NONE 0 +#define AC_SINGLE_OWNER 1 + +#define spg_valid(spg) ((spg)->is_alive == true) + +#define byte2kb(size) ((size) >> 10) +#define byte2mb(size) ((size) >> 20) +#define page2kb(page_num) ((page_num) << (PAGE_SHIFT - 10)) + +#define SINGLE_GROUP_MODE 1 +#define MULTI_GROUP_MODE 2 + +#define MAX_GROUP_FOR_SYSTEM 50000 +#define MAX_GROUP_FOR_TASK 3000 +#define MAX_PROC_PER_GROUP 1024 + +#define GROUP_NONE 0 + +#define SEC2US(sec) ((sec) * 1000000) +#define NS2US(ns) ((ns) / 1000) + +#define PF_DOMAIN_CORE 0x10000000 /* AOS CORE processes in sched.h */ + +/* mdc scene hack */ +static int __read_mostly enable_mdc_default_group; +static const int mdc_default_group_id = 1; + +/* share the uva to the whole group */ +static int __read_mostly enable_share_k2u_spg = 1; + +static int share_pool_group_mode = SINGLE_GROUP_MODE; + +static unsigned int sp_device_number; +static unsigned long sp_dev_va_start[MAX_DEVID]; +static unsigned long sp_dev_va_size[MAX_DEVID]; + +static bool is_sp_dev_addr_enabled(int device_id) +{ + return sp_dev_va_size[device_id]; +} + +static BLOCKING_NOTIFIER_HEAD(sp_notifier_chain); + +/* + * Group '0' is for k2u_task and pass-through. No process will actually be + * added to it. + */ +static struct sp_group *spg_none; + +static struct sp_group *create_spg(int spg_id) +{ + return NULL; +} + +static bool is_online_node_id(int node_id) +{ + return node_id >= 0 && node_id < MAX_NUMNODES && node_online(node_id); +} + +static bool is_device_addr(unsigned long addr) +{ + int i; + + for (i = 0; i < sp_device_number; i++) { + if (addr >= sp_dev_va_start[i] && + addr < sp_dev_va_start[i] + sp_dev_va_size[i]) + return true; + } + return false; +} + +/** + * sp_group_id_by_pid() - Get the sp_group ID of a process. + * @pid: pid of target process. + * + * Return: + * >0 - the sp_group ID. + * -ENODEV - target process doesn't belong to any sp_group. + */ +int sp_group_id_by_pid(int pid) +{ + return 0; +} +EXPORT_SYMBOL_GPL(sp_group_id_by_pid); + +/** + * mg_sp_group_id_by_pid() - Get the sp_group ID array of a process. + * @pid: pid of target process. + * @spg_ids: points to an array in which to save the group ids the process belongs to + * @num: on input, the size of the spg_ids array; on output, the number of groups the process belongs to + * + * Return: + * >0 - the sp_group ID. + * -ENODEV - target process doesn't belong to any sp_group. + * -EINVAL - spg_ids or num is NULL. + * -E2BIG - the number of groups the process belongs to is larger than *num + */ +int mg_sp_group_id_by_pid(int pid, int *spg_ids, int *num) +{ + return 0; +} +EXPORT_SYMBOL_GPL(mg_sp_group_id_by_pid); + int mg_sp_group_add_task(int pid, unsigned long prot, int spg_id) { return 0; @@ -33,6 +137,283 @@ int sp_group_add_task(int pid, int spg_id) } EXPORT_SYMBOL_GPL(sp_group_add_task);
+/** + * mg_sp_group_del_task() - delete a process from an sp group. + * @pid: the pid of the task to be deleted + * @spg_id: sharepool group id + * + * The group's spa list must be empty, or deletion will fail. + * + * Return: + * * on success, return 0. + * * -EINVAL, spg_id invalid or spa list not empty or spg dead + * * -ESRCH, the task group of pid is not in the group / process is dead + */ +int mg_sp_group_del_task(int pid, int spg_id) +{ + return 0; +} +EXPORT_SYMBOL_GPL(mg_sp_group_del_task); + +int sp_group_del_task(int pid, int spg_id) +{ + return mg_sp_group_del_task(pid, spg_id); +} +EXPORT_SYMBOL_GPL(sp_group_del_task); + +/** + * sp_free() - Free the memory allocated by sp_alloc(). + * @addr: the starting VA of the memory. + * + * Return: + * * 0 - success. + * * -EINVAL - the memory can't be found or was not allocated by share pool. + * * -EPERM - the caller has no permission to free the memory. + */ +int sp_free(unsigned long addr) +{ + return 0; +} +EXPORT_SYMBOL_GPL(sp_free); + +int mg_sp_free(unsigned long addr) +{ + return sp_free(addr); +} +EXPORT_SYMBOL_GPL(mg_sp_free); + +/** + * sp_alloc() - Allocate shared memory for all the processes in an sp_group. + * @size: the size of memory to allocate. + * @sp_flags: how to allocate the memory. + * @spg_id: the share group that the memory is allocated to. + * + * Use pass-through allocation if spg_id == SPG_ID_DEFAULT in multi-group mode. + * + * Return: + * * on success, return the starting address of the shared memory. + * * on failure, return ERR_PTR(-errno). + */ +void *sp_alloc(unsigned long size, unsigned long sp_flags, int spg_id) +{ + return NULL; +} +EXPORT_SYMBOL_GPL(sp_alloc); + +void *mg_sp_alloc(unsigned long size, unsigned long sp_flags, int spg_id) +{ + return sp_alloc(size, sp_flags, spg_id); +} +EXPORT_SYMBOL_GPL(mg_sp_alloc); + +/** + * sp_make_share_k2u() - Share kernel memory to current process or an sp_group. + * @kva: the VA of shared kernel memory. + * @size: the size of shared kernel memory. + * @sp_flags: how to allocate the memory. We only support SP_DVPP. + * @pid: the pid of the specified process (Not currently in use). + * @spg_id: the share group that the memory is shared to. + * + * Share kernel memory to the current task if spg_id == SPG_ID_NONE + * or SPG_ID_DEFAULT in multi-group mode. + * + * Return: + * * on success, return the shared user address to start at. + * * on failure, return ERR_PTR(-errno). + */ +void *sp_make_share_k2u(unsigned long kva, unsigned long size, + unsigned long sp_flags, int pid, int spg_id) +{ + return NULL; +} +EXPORT_SYMBOL_GPL(sp_make_share_k2u); + +void *mg_sp_make_share_k2u(unsigned long kva, unsigned long size, + unsigned long sp_flags, int pid, int spg_id) +{ + return sp_make_share_k2u(kva, size, sp_flags, pid, spg_id); +} +EXPORT_SYMBOL_GPL(mg_sp_make_share_k2u); + +/** + * sp_make_share_u2k() - Share user memory of a specified process to kernel. + * @uva: the VA of shared user memory + * @size: the size of shared user memory + * @pid: the pid of the specified process (Not currently in use) + * + * Return: + * * on success, return the starting kernel address of the shared memory. + * * on failure, return ERR_PTR(-errno).
+ */ +void *sp_make_share_u2k(unsigned long uva, unsigned long size, int pid) +{ + return NULL; +} +EXPORT_SYMBOL_GPL(sp_make_share_u2k); + +void *mg_sp_make_share_u2k(unsigned long uva, unsigned long size, int pid) +{ + return sp_make_share_u2k(uva, size, pid); +} +EXPORT_SYMBOL_GPL(mg_sp_make_share_u2k); + +/** + * sp_unshare() - Unshare the kernel or user memory which was shared by calling + * sp_make_share_{k2u,u2k}(). + * @va: the specified virtual address of memory + * @size: the size of unshared memory + * + * Use spg_id of current thread if spg_id == SPG_ID_DEFAULT. + * + * Return: 0 for success, -errno on failure. + */ +int sp_unshare(unsigned long va, unsigned long size, int pid, int spg_id) +{ + return 0; +} +EXPORT_SYMBOL_GPL(sp_unshare); + +int mg_sp_unshare(unsigned long va, unsigned long size) +{ + return sp_unshare(va, size, 0, 0); +} +EXPORT_SYMBOL_GPL(mg_sp_unshare); + +/** + * sp_walk_page_range() - Walk page table with caller specific callbacks. + * @uva: the start VA of user memory. + * @size: the size of user memory. + * @tsk: task struct of the target task. + * @sp_walk_data: a structure of a page pointer array. + * + * Return: 0 for success, -errno on failure. + * + * When 0 is returned, the sp_walk_data describing [uva, uva+size) can be used. + * When -errno is returned, the information in sp_walk_data is useless. + */ +int sp_walk_page_range(unsigned long uva, unsigned long size, + struct task_struct *tsk, struct sp_walk_data *sp_walk_data) +{ + return 0; +} +EXPORT_SYMBOL_GPL(sp_walk_page_range); + +int mg_sp_walk_page_range(unsigned long uva, unsigned long size, + struct task_struct *tsk, struct sp_walk_data *sp_walk_data) +{ + return sp_walk_page_range(uva, size, tsk, sp_walk_data); +} +EXPORT_SYMBOL_GPL(mg_sp_walk_page_range); + +/** + * sp_walk_page_free() - Free the sp_walk_data structure. + * @sp_walk_data: a structure of a page pointer array to be freed. + */ +void sp_walk_page_free(struct sp_walk_data *sp_walk_data) +{ + return; +} +EXPORT_SYMBOL_GPL(sp_walk_page_free); + +void mg_sp_walk_page_free(struct sp_walk_data *sp_walk_data) +{ + sp_walk_page_free(sp_walk_data); +} +EXPORT_SYMBOL_GPL(mg_sp_walk_page_free); + +int sp_register_notifier(struct notifier_block *nb) +{ + return blocking_notifier_chain_register(&sp_notifier_chain, nb); +} +EXPORT_SYMBOL_GPL(sp_register_notifier); + +int sp_unregister_notifier(struct notifier_block *nb) +{ + return blocking_notifier_chain_unregister(&sp_notifier_chain, nb); +} +EXPORT_SYMBOL_GPL(sp_unregister_notifier); + +/** + * sp_config_dvpp_range() - Configure the share pool start address + * of each Da-vinci device. + * @start: the start address of the share pool + * @size: the size of the share pool + * @device_id: the number of the Da-vinci device + * @pid: the pid of the device process + * + * Return true for success. + * Return false if a parameter is invalid or the range has already been set up. + * This function has no concurrency problems.
+ */ +bool sp_config_dvpp_range(size_t start, size_t size, int device_id, int pid) +{ + if (pid < 0 || + size <= 0 || size > MMAP_SHARE_POOL_16G_SIZE || + device_id < 0 || device_id >= sp_device_number || + !is_online_node_id(device_id) || + is_sp_dev_addr_enabled(device_id)) + return false; + + sp_dev_va_start[device_id] = start; + sp_dev_va_size[device_id] = size; + return true; +} +EXPORT_SYMBOL_GPL(sp_config_dvpp_range); + +bool mg_sp_config_dvpp_range(size_t start, size_t size, int device_id, int pid) +{ + return sp_config_dvpp_range(start, size, device_id, pid); +} +EXPORT_SYMBOL_GPL(mg_sp_config_dvpp_range); + +static bool is_sp_normal_addr(unsigned long addr) +{ + return addr >= MMAP_SHARE_POOL_START && + addr < MMAP_SHARE_POOL_16G_START + + sp_device_number * MMAP_SHARE_POOL_16G_SIZE; +} + +/** + * is_sharepool_addr() - Check if a user memory address belongs to share pool. + * @addr: the userspace address to be checked. + * + * Return true if addr belongs to share pool, false otherwise. + */ +bool is_sharepool_addr(unsigned long addr) +{ + return is_sp_normal_addr(addr) || is_device_addr(addr); +} +EXPORT_SYMBOL_GPL(is_sharepool_addr); + +bool mg_is_sharepool_addr(unsigned long addr) +{ + return is_sharepool_addr(addr); +} +EXPORT_SYMBOL_GPL(mg_is_sharepool_addr); + +static int __init mdc_default_group(char *s) +{ + enable_mdc_default_group = 1; + return 1; +} +__setup("enable_mdc_default_group", mdc_default_group); + +static int __init enable_share_k2u_to_group(char *s) +{ + enable_share_k2u_spg = 1; + return 1; +} +__setup("enable_sp_share_k2u_spg", enable_share_k2u_to_group); + +static int __init enable_sp_multi_group_mode(char *s) +{ + share_pool_group_mode = MULTI_GROUP_MODE; + return 1; +} +__setup("enable_sp_multi_group_mode", enable_sp_multi_group_mode); + DEFINE_STATIC_KEY_FALSE(share_pool_enabled_key);
static int __init enable_share_pool(char *s) @@ -43,3 +424,33 @@ static int __init enable_share_pool(char *s) return 1; } __setup("enable_ascend_share_pool", enable_share_pool); + +static void __init sp_device_number_detect(void) +{ + /* NOTE: TO BE COMPLETED */ + sp_device_number = 4; + + if (sp_device_number > MAX_DEVID) { + pr_warn("sp_device_number %d exceeds the maximum, truncate it to %d\n", + sp_device_number, MAX_DEVID); + sp_device_number = MAX_DEVID; + } +} + +static int __init share_pool_init(void) +{ + /* lockless, as no other sp operation can run during init */ + spg_none = create_spg(GROUP_NONE); + /* spg_none is leaked on failure, which is not a serious problem */ + if (IS_ERR(spg_none) || !spg_none) + goto fail; + + sp_device_number_detect(); + + return 0; +fail: + pr_err("Ascend share pool initialization failed\n"); + static_branch_disable(&share_pool_enabled_key); + return 1; +} +late_initcall(share_pool_init);
From: Wang Wensheng wangwensheng4@huawei.com
ascend inclusion category: Feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4NDAW CVE: NA
-------------------
Lower the mmap_base in mm_struct and enforce the limit in get_unmapped_area() so that normal mappings stay out of the share pool address range.
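The core of the change is a clamp on the search limit used by the unmapped-area allocator. A minimal sketch of the intended effect, assuming only the helpers visible in the diff below (the demo function name is made up):

/* mirrors sp_area_work_around(): normal mmaps must stay below the
 * share pool region unless the caller explicitly asks for DVPP space */
static unsigned long demo_clamp_high_limit(unsigned long high_limit,
					   unsigned long flags)
{
	if (sp_is_enabled() && !(flags & MAP_DVPP))
		return min(high_limit, MMAP_SHARE_POOL_START);
	return high_limit;
}

With this clamp in place, mmap_base() can simply start the top-down search at ALIGN_DOWN(MMAP_SHARE_POOL_START - rnd, PAGE_SIZE), which is exactly what the mm/util.c hunk does.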
Signed-off-by: Wang Wensheng wangwensheng4@huawei.com Signed-off-by: Peng Wu wupeng58@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Reviewed-by: Weilong Chen chenweilong@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- include/linux/share_pool.h | 12 +++++++++++- mm/mmap.c | 7 +++++++ mm/util.c | 4 ++++ 3 files changed, 22 insertions(+), 1 deletion(-)
diff --git a/include/linux/share_pool.h b/include/linux/share_pool.h index 37a26487a7d8..3a56238c8a4d 100644 --- a/include/linux/share_pool.h +++ b/include/linux/share_pool.h @@ -262,6 +262,16 @@ static inline bool sp_is_enabled(void) return static_branch_likely(&share_pool_enabled_key); }
+static inline void sp_area_work_around(struct vm_unmapped_area_info *info, + unsigned long flags) +{ + /* MAP_DVPP cannot be used together with MAP_SHARE_POOL. In addition, the + * address ranges corresponding to the two flags must not overlap. + */ + if (sp_is_enabled() && !(flags & MAP_DVPP)) + info->high_limit = min(info->high_limit, MMAP_SHARE_POOL_START); +} + #else /* CONFIG_ASCEND_SHARE_POOL */
static inline int mg_sp_group_add_task(int pid, unsigned long prot, int spg_id) @@ -442,7 +452,7 @@ static inline bool sp_is_enabled(void) return false; }
-static inline void sp_area_work_around(struct vm_unmapped_area_info *info) +static inline void sp_area_work_around(struct vm_unmapped_area_info *info, unsigned long flags) { }
diff --git a/mm/mmap.c b/mm/mmap.c index 0af1300734a2..d5a97a56dca7 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -48,6 +48,7 @@ #include <linux/oom.h> #include <linux/sched/mm.h> #include <linux/swapops.h> +#include <linux/share_pool.h>
#include <linux/uaccess.h> #include <asm/cacheflush.h> @@ -2450,6 +2451,8 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr, if (enable_mmap_dvpp) dvpp_mmap_get_area(&info, flags);
+ sp_area_work_around(&info, flags); + return vm_unmapped_area(&info); } #endif @@ -2500,6 +2503,8 @@ arch_get_unmapped_area_topdown(struct file *filp, unsigned long addr, if (enable_mmap_dvpp) dvpp_mmap_get_area(&info, flags);
+ sp_area_work_around(&info, flags); + addr = vm_unmapped_area(&info);
/* @@ -2517,6 +2522,8 @@ arch_get_unmapped_area_topdown(struct file *filp, unsigned long addr, if (enable_mmap_dvpp) dvpp_mmap_get_area(&info, flags);
+ sp_area_work_around(&info, flags); + addr = vm_unmapped_area(&info); }
diff --git a/mm/util.c b/mm/util.c index 2350c064abc6..d31820abadb4 100644 --- a/mm/util.c +++ b/mm/util.c @@ -23,6 +23,7 @@ #include <linux/processor.h> #include <linux/sizes.h> #include <linux/compat.h> +#include <linux/share_pool.h>
#include <linux/uaccess.h> #include <linux/oom.h> @@ -392,6 +393,9 @@ static unsigned long mmap_base(unsigned long rnd, struct rlimit *rlim_stack) else if (gap > MAX_GAP) gap = MAX_GAP;
+ if (sp_is_enabled()) + return ALIGN_DOWN(MMAP_SHARE_POOL_START - rnd, PAGE_SIZE); + return PAGE_ALIGN(STACK_TOP - gap - rnd); }
From: Wang Wensheng wangwensheng4@huawei.com
ascend inclusion category: Feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4NDAW CVE: NA
-------------------
1. /proc/sharepool/*: these interfaces show, system-wide, the processes that are in share pool groups as well as all the groups. 2. /proc/<pid>/sp_group: exposes the per-task sp_group state.
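For orientation, the column layout below is reconstructed from the seq_printf() format strings in the diff that follows; it only sketches the shape of the output, with no real values:

/proc/sharepool/proc_stat:
PID      Group_ID SP_ALLOC  SP_K2U    SP_RES    SP_RES_T   Non-SP_RES VIRT    RES     Shm     Non-SP_Shm PROT

/proc/<pid>/sp_group (aggregate line, then one line per group):
PID      COMM             SP_ALLOC  SP_K2U    SP_RES    Non-SP_RES Non-SP_Shm VIRT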
Signed-off-by: Wang Wensheng wangwensheng4@huawei.com Signed-off-by: Tang Yizhou tangyizhou@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Reviewed-by: Weilong Chen chenweilong@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- fs/proc/base.c | 7 + mm/share_pool.c | 930 ++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 937 insertions(+)
diff --git a/fs/proc/base.c b/fs/proc/base.c index 752200879c80..7edbfd2ef757 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -96,6 +96,7 @@ #include <linux/posix-timers.h> #include <linux/time_namespace.h> #include <linux/resctrl.h> +#include <linux/share_pool.h> #include <trace/events/oom.h> #include "internal.h" #include "fd.h" @@ -3297,6 +3298,9 @@ static const struct pid_entry tgid_base_stuff[] = { #ifdef CONFIG_SECCOMP_CACHE_DEBUG ONE("seccomp_cache", S_IRUSR, proc_pid_seccomp_cache), #endif +#ifdef CONFIG_ASCEND_SHARE_POOL + ONE("sp_group", 0444, proc_sp_group_state), +#endif };
static int proc_tgid_base_readdir(struct file *file, struct dir_context *ctx) @@ -3631,6 +3635,9 @@ static const struct pid_entry tid_base_stuff[] = { #ifdef CONFIG_SECCOMP_CACHE_DEBUG ONE("seccomp_cache", S_IRUSR, proc_pid_seccomp_cache), #endif +#ifdef CONFIG_ASCEND_SHARE_POOL + ONE("sp_group", 0444, proc_sp_group_state), +#endif };
static int proc_tid_base_readdir(struct file *file, struct dir_context *ctx) diff --git a/mm/share_pool.c b/mm/share_pool.c index 67b1aad7d393..fe0e36a2214e 100644 --- a/mm/share_pool.c +++ b/mm/share_pool.c @@ -20,6 +20,37 @@ #define pr_fmt(fmt) "share pool: " fmt
#include <linux/share_pool.h> +#include <linux/sched.h> +#include <linux/sched/task.h> +#include <linux/sched/mm.h> +#include <linux/mm_types.h> +#include <linux/idr.h> +#include <linux/mutex.h> +#include <linux/rwsem.h> +#include <linux/spinlock.h> +#include <linux/slab.h> +#include <linux/rbtree.h> +#include <linux/shmem_fs.h> +#include <linux/file.h> +#include <linux/printk.h> +#include <linux/hugetlb.h> +#include <linux/vmalloc.h> +#include <linux/pid.h> +#include <linux/pid_namespace.h> +#include <linux/atomic.h> +#include <linux/lockdep.h> +#include <linux/kernel.h> +#include <linux/falloc.h> +#include <linux/types.h> +#include <linux/proc_fs.h> +#include <linux/seq_file.h> +#include <linux/rmap.h> +#include <linux/compaction.h> +#include <linux/preempt.h> +#include <linux/swapops.h> +#include <linux/mmzone.h> +#include <linux/timekeeping.h> +#include <linux/time64.h>
/* access control mode macros */ #define AC_NONE 0 @@ -63,14 +94,406 @@ static bool is_sp_dev_addr_enabled(int device_id) return sp_dev_va_size[device_id]; }
+/* idr of all sp_groups */ +static DEFINE_IDR(sp_group_idr); +/* rw semaphore for sp_group_idr and mm->sp_group_master */ +static DECLARE_RWSEM(sp_group_sem); + static BLOCKING_NOTIFIER_HEAD(sp_notifier_chain);
+static DEFINE_IDA(sp_group_id_ida); + +/*** Statistical and maintenance tools ***/ + +/* idr of all sp_proc_stats */ +static DEFINE_IDR(sp_proc_stat_idr); +/* rw semaphore for sp_proc_stat_idr */ +static DECLARE_RWSEM(sp_proc_stat_sem); + +/* idr of all sp_spg_stats */ +static DEFINE_IDR(sp_spg_stat_idr); +/* rw semaphore for sp_spg_stat_idr */ +static DECLARE_RWSEM(sp_spg_stat_sem); + +/* for kthread buff_module_guard_work */ +static struct sp_proc_stat kthread_stat; + +/* The caller must hold sp_group_sem */ +static struct sp_group_master *sp_init_group_master_locked( + struct mm_struct *mm, bool *exist) +{ + struct sp_group_master *master = mm->sp_group_master; + + if (master) { + *exist = true; + return master; + } + + master = kmalloc(sizeof(struct sp_group_master), GFP_KERNEL); + if (master == NULL) + return ERR_PTR(-ENOMEM); + + INIT_LIST_HEAD(&master->node_list); + master->count = 0; + master->stat = NULL; + master->mm = mm; + mm->sp_group_master = master; + + *exist = false; + return master; +} + +static struct sp_proc_stat *create_proc_stat(struct mm_struct *mm, + struct task_struct *tsk) +{ + struct sp_proc_stat *stat; + + stat = kmalloc(sizeof(*stat), GFP_KERNEL); + if (stat == NULL) + return ERR_PTR(-ENOMEM); + + atomic_set(&stat->use_count, 1); + atomic64_set(&stat->alloc_size, 0); + atomic64_set(&stat->k2u_size, 0); + stat->tgid = tsk->tgid; + stat->mm = mm; + mutex_init(&stat->lock); + hash_init(stat->hash); + get_task_comm(stat->comm, tsk); + + return stat; +} + +static struct sp_proc_stat *sp_init_proc_stat(struct sp_group_master *master, + struct mm_struct *mm, struct task_struct *tsk) +{ + struct sp_proc_stat *stat; + int alloc_id, tgid = tsk->tgid; + + down_write(&sp_proc_stat_sem); + stat = master->stat; + if (stat) { + up_write(&sp_proc_stat_sem); + return stat; + } + + stat = create_proc_stat(mm, tsk); + if (IS_ERR(stat)) { + up_write(&sp_proc_stat_sem); + return stat; + } + + alloc_id = idr_alloc(&sp_proc_stat_idr, stat, tgid, tgid + 1, GFP_KERNEL); + if (alloc_id < 0) { + up_write(&sp_proc_stat_sem); + pr_err_ratelimited("proc stat idr alloc failed %d\n", alloc_id); + kfree(stat); + return ERR_PTR(alloc_id); + } + + master->stat = stat; + up_write(&sp_proc_stat_sem); + + return stat; +} + +static void update_spg_stat_alloc(unsigned long size, bool inc, + bool huge, struct sp_spg_stat *stat) +{ + if (inc) { + atomic_inc(&stat->spa_num); + atomic64_add(size, &stat->size); + atomic64_add(size, &stat->alloc_size); + if (huge) + atomic64_add(size, &stat->alloc_hsize); + else + atomic64_add(size, &stat->alloc_nsize); + } else { + atomic_dec(&stat->spa_num); + atomic64_sub(size, &stat->size); + atomic64_sub(size, &stat->alloc_size); + if (huge) + atomic64_sub(size, &stat->alloc_hsize); + else + atomic64_sub(size, &stat->alloc_nsize); + } +} + +static void update_spg_stat_k2u(unsigned long size, bool inc, + struct sp_spg_stat *stat) +{ + if (inc) { + atomic_inc(&stat->spa_num); + atomic64_add(size, &stat->size); + atomic64_add(size, &stat->k2u_size); + } else { + atomic_dec(&stat->spa_num); + atomic64_sub(size, &stat->size); + atomic64_sub(size, &stat->k2u_size); + } +} + +/* per process/sp-group memory usage statistics */ +struct spg_proc_stat { + int tgid; + int spg_id; /* 0 for non-group data, such as k2u_task */ + struct hlist_node pnode; /* hlist node in sp_proc_stat->hash */ + struct hlist_node gnode; /* hlist node in sp_spg_stat->hash */ + struct sp_proc_stat *proc_stat; + struct sp_spg_stat *spg_stat; + /* + * alloc amount minus free amount, may be negative when 
freed by + * another task in the same sp group. + */ + atomic64_t alloc_size; + atomic64_t k2u_size; +}; + +static void update_spg_proc_stat_alloc(unsigned long size, bool inc, + struct spg_proc_stat *stat) +{ + struct sp_proc_stat *proc_stat = stat->proc_stat; + + if (inc) { + atomic64_add(size, &stat->alloc_size); + atomic64_add(size, &proc_stat->alloc_size); + } else { + atomic64_sub(size, &stat->alloc_size); + atomic64_sub(size, &proc_stat->alloc_size); + } +} + +static void update_spg_proc_stat_k2u(unsigned long size, bool inc, + struct spg_proc_stat *stat) +{ + struct sp_proc_stat *proc_stat = stat->proc_stat; + + if (inc) { + atomic64_add(size, &stat->k2u_size); + atomic64_add(size, &proc_stat->k2u_size); + } else { + atomic64_sub(size, &stat->k2u_size); + atomic64_sub(size, &proc_stat->k2u_size); + } +} + +static struct spg_proc_stat *find_spg_proc_stat( + struct sp_proc_stat *proc_stat, int tgid, int spg_id) +{ + struct spg_proc_stat *stat = NULL; + + mutex_lock(&proc_stat->lock); + hash_for_each_possible(proc_stat->hash, stat, pnode, spg_id) { + if (stat->spg_id == spg_id) + break; + } + mutex_unlock(&proc_stat->lock); + + return stat; +} + +static struct spg_proc_stat *create_spg_proc_stat(int tgid, int spg_id) +{ + struct spg_proc_stat *stat; + + stat = kmalloc(sizeof(struct spg_proc_stat), GFP_KERNEL); + if (stat == NULL) + return ERR_PTR(-ENOMEM); + + stat->tgid = tgid; + stat->spg_id = spg_id; + atomic64_set(&stat->alloc_size, 0); + atomic64_set(&stat->k2u_size, 0); + + return stat; +} + +static struct spg_proc_stat *sp_init_spg_proc_stat( + struct sp_proc_stat *proc_stat, int tgid, struct sp_group *spg) +{ + struct spg_proc_stat *stat; + int spg_id = spg->id; /* visit spg id locklessly */ + struct sp_spg_stat *spg_stat = spg->stat; + + stat = find_spg_proc_stat(proc_stat, tgid, spg_id); + if (stat) + return stat; + + stat = create_spg_proc_stat(tgid, spg_id); + if (IS_ERR(stat)) + return stat; + + stat->proc_stat = proc_stat; + stat->spg_stat = spg_stat; + + mutex_lock(&proc_stat->lock); + hash_add(proc_stat->hash, &stat->pnode, stat->spg_id); + mutex_unlock(&proc_stat->lock); + + mutex_lock(&spg_stat->lock); + hash_add(spg_stat->hash, &stat->gnode, stat->tgid); + mutex_unlock(&spg_stat->lock); + return stat; +} + +/* + * The caller must + * 1. ensure no concurrency problem for task_struct and mm_struct. + * 2. 
hold sp_group_sem for sp_group_master (pay attention to ABBA deadlock) + */ +static struct spg_proc_stat *sp_init_process_stat(struct task_struct *tsk, + struct mm_struct *mm, struct sp_group *spg) +{ + struct sp_group_master *master; + bool exist; + struct sp_proc_stat *proc_stat; + struct spg_proc_stat *spg_proc_stat; + + master = sp_init_group_master_locked(mm, &exist); + if (IS_ERR(master)) + return (struct spg_proc_stat *)master; + + proc_stat = sp_init_proc_stat(master, mm, tsk); + if (IS_ERR(proc_stat)) + return (struct spg_proc_stat *)proc_stat; + + spg_proc_stat = sp_init_spg_proc_stat(proc_stat, tsk->tgid, spg); + return spg_proc_stat; +} + +static struct sp_spg_stat *create_spg_stat(int spg_id) +{ + struct sp_spg_stat *stat; + + stat = kmalloc(sizeof(*stat), GFP_KERNEL); + if (stat == NULL) + return ERR_PTR(-ENOMEM); + + stat->spg_id = spg_id; + atomic_set(&stat->hugepage_failures, 0); + atomic_set(&stat->spa_num, 0); + atomic64_set(&stat->size, 0); + atomic64_set(&stat->alloc_nsize, 0); + atomic64_set(&stat->alloc_hsize, 0); + atomic64_set(&stat->alloc_size, 0); + mutex_init(&stat->lock); + hash_init(stat->hash); + + return stat; +} + +static int sp_init_spg_stat(struct sp_group *spg) +{ + struct sp_spg_stat *stat; + int ret, spg_id = spg->id; + + stat = create_spg_stat(spg_id); + if (IS_ERR(stat)) + return PTR_ERR(stat); + + down_write(&sp_spg_stat_sem); + ret = idr_alloc(&sp_spg_stat_idr, stat, spg_id, spg_id + 1, + GFP_KERNEL); + up_write(&sp_spg_stat_sem); + if (ret < 0) { + pr_err_ratelimited("group %d idr alloc failed, ret %d\n", + spg_id, ret); + kfree(stat); + /* don't leave a dangling pointer in spg->stat */ + return ret; + } + + spg->stat = stat; + return 0; +} + +static void free_spg_stat(int spg_id) +{ + struct sp_spg_stat *stat; + + down_write(&sp_spg_stat_sem); + stat = idr_remove(&sp_spg_stat_idr, spg_id); + up_write(&sp_spg_stat_sem); + WARN_ON(!stat); + kfree(stat); +} + /* * Group '0' is for k2u_task and pass-through. No process will actually be * added to it. */ static struct sp_group *spg_none;
+/* statistics of all sp areas, protected by sp_area_lock */ +struct sp_spa_stat { + unsigned int total_num; + unsigned int alloc_num; + unsigned int k2u_task_num; + unsigned int k2u_spg_num; + unsigned long total_size; + unsigned long alloc_size; + unsigned long k2u_task_size; + unsigned long k2u_spg_size; + unsigned long dvpp_size; + unsigned long dvpp_va_size; +}; + +static struct sp_spa_stat spa_stat; + +/* statistics of all sp groups born from sp_alloc() and k2u(spg) */ +struct sp_overall_stat { + atomic_t spa_total_num; + atomic64_t spa_total_size; +}; + +static struct sp_overall_stat sp_overall_stat; + +/*** Global share pool VA allocator ***/ + +enum spa_type { + SPA_TYPE_ALLOC = 1, + SPA_TYPE_K2TASK, + SPA_TYPE_K2SPG, +}; + +/* + * We bump the reference each time an mmap succeeds, and drop it when the + * vma is about to be released, so the sp_area object is automatically + * freed once all tasks in the sp group have exited. + */ +struct sp_area { + unsigned long va_start; + unsigned long va_end; /* va_end is always hugepage-aligned */ + unsigned long real_size; /* real size with alignment */ + unsigned long region_vstart; /* belongs to the normal or the DVPP region */ + unsigned long flags; + bool is_hugepage; + bool is_dead; + atomic_t use_count; /* How many vmas use this VA region */ + struct rb_node rb_node; /* address sorted rbtree */ + struct list_head link; /* link to the spg->head */ + struct sp_group *spg; + enum spa_type type; /* where the spa was born from */ + struct mm_struct *mm; /* owner of k2u(task) */ + unsigned long kva; /* shared kva */ + pid_t applier; /* the original applier process */ + int node_id; /* memory node */ + int device_id; +}; +static DEFINE_SPINLOCK(sp_area_lock); +static struct rb_root sp_area_root = RB_ROOT; + +static unsigned long spa_size(struct sp_area *spa) +{ + return spa->real_size; +} + +static struct file *spa_file(struct sp_area *spa) +{ + if (spa->is_hugepage) + return spa->spg->file_hugetlb; + else + return spa->spg->file; +} + static struct sp_group *create_spg(int spg_id) { return NULL; @@ -137,6 +560,10 @@ int sp_group_add_task(int pid, int spg_id) } EXPORT_SYMBOL_GPL(sp_group_add_task);
+static void __sp_area_drop_locked(struct sp_area *spa) +{ +} + /** * mg_sp_group_del_task() - delete a process from a sp group. * @pid: the pid of the task to be deleted @@ -414,6 +841,508 @@ static int __init enable_sp_multi_group_mode(char *s) } __setup("enable_sp_multi_group_mode", enable_sp_multi_group_mode);
+/*** Statistical and maintenance functions ***/ + +static void free_process_spg_proc_stat(struct sp_proc_stat *proc_stat) +{ + int i; + struct spg_proc_stat *stat; + struct hlist_node *tmp; + struct sp_spg_stat *spg_stat; + + /* traverse proc_stat->hash locklessly as process is exiting */ + hash_for_each_safe(proc_stat->hash, i, tmp, stat, pnode) { + spg_stat = stat->spg_stat; + mutex_lock(&spg_stat->lock); + hash_del(&stat->gnode); + mutex_unlock(&spg_stat->lock); + + hash_del(&stat->pnode); + kfree(stat); + } +} + +static void free_sp_proc_stat(struct sp_proc_stat *stat) +{ + free_process_spg_proc_stat(stat); + + down_write(&sp_proc_stat_sem); + stat->mm->sp_group_master->stat = NULL; + idr_remove(&sp_proc_stat_idr, stat->tgid); + up_write(&sp_proc_stat_sem); + kfree(stat); +} + +/* the caller must make sure stat is not NULL */ +void sp_proc_stat_drop(struct sp_proc_stat *stat) +{ + if (atomic_dec_and_test(&stat->use_count)) + free_sp_proc_stat(stat); +} + +static void get_mm_rss_info(struct mm_struct *mm, unsigned long *anon, + unsigned long *file, unsigned long *shmem, unsigned long *total_rss) +{ + *anon = get_mm_counter(mm, MM_ANONPAGES); + *file = get_mm_counter(mm, MM_FILEPAGES); + *shmem = get_mm_counter(mm, MM_SHMEMPAGES); + *total_rss = *anon + *file + *shmem; +} + +static long get_proc_alloc(struct sp_proc_stat *stat) +{ + return byte2kb(atomic64_read(&stat->alloc_size)); +} + +static long get_proc_k2u(struct sp_proc_stat *stat) +{ + return byte2kb(atomic64_read(&stat->k2u_size)); +} + +static long get_spg_alloc(struct sp_spg_stat *stat) +{ + return byte2kb(atomic64_read(&stat->alloc_size)); +} + +static long get_spg_alloc_nsize(struct sp_spg_stat *stat) +{ + return byte2kb(atomic64_read(&stat->alloc_nsize)); +} + +static long get_spg_proc_alloc(struct spg_proc_stat *stat) +{ + return byte2kb(atomic64_read(&stat->alloc_size)); +} + +static long get_spg_proc_k2u(struct spg_proc_stat *stat) +{ + return byte2kb(atomic64_read(&stat->k2u_size)); +} + +static void get_process_sp_res(struct sp_proc_stat *stat, + long *sp_res_out, long *sp_res_nsize_out) +{ + int i; + struct spg_proc_stat *spg_proc_stat; + struct sp_spg_stat *spg_stat; + long sp_res = 0, sp_res_nsize = 0; + + mutex_lock(&stat->lock); + hash_for_each(stat->hash, i, spg_proc_stat, pnode) { + spg_stat = spg_proc_stat->spg_stat; + sp_res += get_spg_alloc(spg_stat); + sp_res_nsize += get_spg_alloc_nsize(spg_stat); + } + mutex_unlock(&stat->lock); + + *sp_res_out = sp_res; + *sp_res_nsize_out = sp_res_nsize; +} + +/* + * RSS statistics have a maximum deviation of 64 pages (256KB). + * See check_sync_rss_stat(). + */ +static void get_process_non_sp_res(unsigned long total_rss, unsigned long shmem, + long sp_res_nsize, long *non_sp_res_out, long *non_sp_shm_out) +{ + long non_sp_res, non_sp_shm; + + non_sp_res = page2kb(total_rss) - sp_res_nsize; + non_sp_res = non_sp_res < 0 ? 0 : non_sp_res; + non_sp_shm = page2kb(shmem) - sp_res_nsize; + non_sp_shm = non_sp_shm < 0 ?
0 : non_sp_shm; + + *non_sp_res_out = non_sp_res; + *non_sp_shm_out = non_sp_shm; +} + +static long get_sp_res_by_spg_proc(struct spg_proc_stat *stat) +{ + return byte2kb(atomic64_read(&stat->spg_stat->alloc_size)); +} + +static unsigned long get_process_prot_locked(int spg_id, struct mm_struct *mm) +{ + unsigned long prot = 0; + struct sp_group_node *spg_node; + struct sp_group_master *master = mm->sp_group_master; + + list_for_each_entry(spg_node, &master->node_list, group_node) { + if (spg_node->spg->id == spg_id) { + prot = spg_node->prot; + break; + } + } + return prot; +} + +static void print_process_prot(struct seq_file *seq, unsigned long prot) +{ + if (prot == PROT_READ) + seq_puts(seq, "R"); + else if (prot == (PROT_READ | PROT_WRITE)) + seq_puts(seq, "RW"); + else /* e.g. spg_none */ + seq_puts(seq, "-"); +} + +int proc_sp_group_state(struct seq_file *m, struct pid_namespace *ns, + struct pid *pid, struct task_struct *task) +{ + struct mm_struct *mm = task->mm; + struct sp_group_master *master; + struct sp_proc_stat *proc_stat; + struct spg_proc_stat *spg_proc_stat; + int i; + unsigned long anon, file, shmem, total_rss, prot; + long sp_res, sp_res_nsize, non_sp_res, non_sp_shm; + + if (!mm) + return 0; + + master = mm->sp_group_master; + if (!master) + return 0; + + get_mm_rss_info(mm, &anon, &file, &shmem, &total_rss); + proc_stat = master->stat; + get_process_sp_res(proc_stat, &sp_res, &sp_res_nsize); + get_process_non_sp_res(total_rss, shmem, sp_res_nsize, + &non_sp_res, &non_sp_shm); + + seq_puts(m, "Share Pool Aggregate Data of This Process\n\n"); + seq_printf(m, "%-8s %-16s %-9s %-9s %-9s %-10s %-10s %-8s\n", + "PID", "COMM", "SP_ALLOC", "SP_K2U", "SP_RES", "Non-SP_RES", + "Non-SP_Shm", "VIRT"); + seq_printf(m, "%-8d %-16s %-9ld %-9ld %-9ld %-10ld %-10ld %-8ld\n", + proc_stat->tgid, proc_stat->comm, + get_proc_alloc(proc_stat), + get_proc_k2u(proc_stat), + sp_res, non_sp_res, non_sp_shm, + page2kb(mm->total_vm)); + + seq_puts(m, "\n\nProcess in Each SP Group\n\n"); + seq_printf(m, "%-8s %-9s %-9s %-9s %-4s\n", + "Group_ID", "SP_ALLOC", "SP_K2U", "SP_RES", "PROT"); + + /* to prevent ABBA deadlock, first hold sp_group_sem */ + down_read(&sp_group_sem); + mutex_lock(&proc_stat->lock); + hash_for_each(proc_stat->hash, i, spg_proc_stat, pnode) { + prot = get_process_prot_locked(spg_proc_stat->spg_id, mm); + seq_printf(m, "%-8d %-9ld %-9ld %-9ld ", + spg_proc_stat->spg_id, + get_spg_proc_alloc(spg_proc_stat), + get_spg_proc_k2u(spg_proc_stat), + get_sp_res_by_spg_proc(spg_proc_stat)); + print_process_prot(m, prot); + seq_putc(m, '\n'); + } + mutex_unlock(&proc_stat->lock); + up_read(&sp_group_sem); + + return 0; +} + +static void rb_spa_stat_show(struct seq_file *seq) +{ + struct rb_node *node; + struct sp_area *spa, *prev = NULL; + + spin_lock(&sp_area_lock); + + for (node = rb_first(&sp_area_root); node; node = rb_next(node)) { + __sp_area_drop_locked(prev); + + spa = rb_entry(node, struct sp_area, rb_node); + prev = spa; + atomic_inc(&spa->use_count); + spin_unlock(&sp_area_lock); + + if (spa->spg == spg_none) /* k2u to task */ + seq_printf(seq, "%-10s ", "None"); + else { + down_read(&spa->spg->rw_lock); + if (spg_valid(spa->spg)) /* k2u to group */ + seq_printf(seq, "%-10d ", spa->spg->id); + else /* spg is dead */ + seq_printf(seq, "%-10s ", "Dead"); + up_read(&spa->spg->rw_lock); + } + + seq_printf(seq, "%2s%-14lx %2s%-14lx %-10ld ", + "0x", spa->va_start, + "0x", spa->va_end, + byte2kb(spa->real_size)); + + switch (spa->type) { + case SPA_TYPE_ALLOC: + seq_printf(seq, 
"%-7s ", "ALLOC"); + break; + case SPA_TYPE_K2TASK: + seq_printf(seq, "%-7s ", "TASK"); + break; + case SPA_TYPE_K2SPG: + seq_printf(seq, "%-7s ", "SPG"); + break; + default: + /* usually impossible, perhaps a developer's mistake */ + break; + } + + if (spa->is_hugepage) + seq_printf(seq, "%-5s ", "Y"); + else + seq_printf(seq, "%-5s ", "N"); + + seq_printf(seq, "%-8d ", spa->applier); + seq_printf(seq, "%-8d\n", atomic_read(&spa->use_count)); + + spin_lock(&sp_area_lock); + } + __sp_area_drop_locked(prev); + spin_unlock(&sp_area_lock); +} + +void spa_overview_show(struct seq_file *seq) +{ + unsigned int total_num, alloc_num, k2u_task_num, k2u_spg_num; + unsigned long total_size, alloc_size, k2u_task_size, k2u_spg_size; + unsigned long dvpp_size, dvpp_va_size; + + if (!sp_is_enabled()) + return; + + spin_lock(&sp_area_lock); + total_num = spa_stat.total_num; + alloc_num = spa_stat.alloc_num; + k2u_task_num = spa_stat.k2u_task_num; + k2u_spg_num = spa_stat.k2u_spg_num; + total_size = spa_stat.total_size; + alloc_size = spa_stat.alloc_size; + k2u_task_size = spa_stat.k2u_task_size; + k2u_spg_size = spa_stat.k2u_spg_size; + dvpp_size = spa_stat.dvpp_size; + dvpp_va_size = spa_stat.dvpp_va_size; + spin_unlock(&sp_area_lock); + + if (seq != NULL) { + seq_printf(seq, "Spa total num %u.\n", total_num); + seq_printf(seq, "Spa alloc num %u, k2u(task) num %u, k2u(spg) num %u.\n", + alloc_num, k2u_task_num, k2u_spg_num); + seq_printf(seq, "Spa total size: %13lu KB\n", byte2kb(total_size)); + seq_printf(seq, "Spa alloc size: %13lu KB\n", byte2kb(alloc_size)); + seq_printf(seq, "Spa k2u(task) size: %13lu KB\n", byte2kb(k2u_task_size)); + seq_printf(seq, "Spa k2u(spg) size: %13lu KB\n", byte2kb(k2u_spg_size)); + seq_printf(seq, "Spa dvpp size: %13lu KB\n", byte2kb(dvpp_size)); + seq_printf(seq, "Spa dvpp va size: %13lu MB\n", byte2mb(dvpp_va_size)); + seq_puts(seq, "\n"); + } else { + pr_info("Spa total num %u.\n", total_num); + pr_info("Spa alloc num %u, k2u(task) num %u, k2u(spg) num %u.\n", + alloc_num, k2u_task_num, k2u_spg_num); + pr_info("Spa total size: %13lu KB\n", byte2kb(total_size)); + pr_info("Spa alloc size: %13lu KB\n", byte2kb(alloc_size)); + pr_info("Spa k2u(task) size: %13lu KB\n", byte2kb(k2u_task_size)); + pr_info("Spa k2u(spg) size: %13lu KB\n", byte2kb(k2u_spg_size)); + pr_info("Spa dvpp size: %13lu KB\n", byte2kb(dvpp_size)); + pr_info("Spa dvpp va size: %13lu MB\n", byte2mb(dvpp_va_size)); + pr_info("\n"); + } +} + +/* the caller must hold sp_group_sem */ +static int idr_spg_stat_cb(int id, void *p, void *data) +{ + struct sp_spg_stat *s = p; + struct seq_file *seq = data; + + if (seq != NULL) { + if (id == 0) + seq_puts(seq, "Non Group "); + else + seq_printf(seq, "Group %6d ", id); + + seq_printf(seq, "size: %lld KB, spa num: %d, total alloc: %lld KB, normal alloc: %lld KB, huge alloc: %lld KB\n", + byte2kb(atomic64_read(&s->size)), + atomic_read(&s->spa_num), + byte2kb(atomic64_read(&s->alloc_size)), + byte2kb(atomic64_read(&s->alloc_nsize)), + byte2kb(atomic64_read(&s->alloc_hsize))); + } else { + if (id == 0) + pr_info("Non Group "); + else + pr_info("Group %6d ", id); + + pr_info("size: %lld KB, spa num: %d, total alloc: %lld KB, normal alloc: %lld KB, huge alloc: %lld KB\n", + byte2kb(atomic64_read(&s->size)), + atomic_read(&s->spa_num), + byte2kb(atomic64_read(&s->alloc_size)), + byte2kb(atomic64_read(&s->alloc_nsize)), + byte2kb(atomic64_read(&s->alloc_hsize))); + } + + return 0; +} + +void spg_overview_show(struct seq_file *seq) +{ + if (!sp_is_enabled()) + return; + + 
if (seq != NULL) { + seq_printf(seq, "Share pool total size: %lld KB, spa total num: %d.\n", + byte2kb(atomic64_read(&sp_overall_stat.spa_total_size)), + atomic_read(&sp_overall_stat.spa_total_num)); + } else { + pr_info("Share pool total size: %lld KB, spa total num: %d.\n", + byte2kb(atomic64_read(&sp_overall_stat.spa_total_size)), + atomic_read(&sp_overall_stat.spa_total_num)); + } + + down_read(&sp_group_sem); + idr_for_each(&sp_spg_stat_idr, idr_spg_stat_cb, seq); + up_read(&sp_group_sem); + + if (seq != NULL) + seq_puts(seq, "\n"); + else + pr_info("\n"); +} + +static int spa_stat_show(struct seq_file *seq, void *offset) +{ + spg_overview_show(seq); + spa_overview_show(seq); + /* print the file header */ + seq_printf(seq, "%-10s %-16s %-16s %-10s %-7s %-5s %-8s %-8s\n", + "Group ID", "va_start", "va_end", "Size(KB)", "Type", "Huge", "PID", "Ref"); + rb_spa_stat_show(seq); + return 0; +} + +static int idr_proc_stat_cb(int id, void *p, void *data) +{ + struct sp_spg_stat *spg_stat = p; + struct seq_file *seq = data; + int i, tgid; + struct sp_proc_stat *proc_stat; + struct spg_proc_stat *spg_proc_stat; + + struct mm_struct *mm; + unsigned long anon, file, shmem, total_rss, prot; + /* + * non_sp_res: resident memory size excluding share pool memory + * sp_res: resident memory size of share pool, including normal + * page and hugepage memory + * non_sp_shm: resident shared memory size excluding share pool + * memory + */ + long sp_res, sp_res_nsize, non_sp_res, non_sp_shm; + + /* to prevent ABBA deadlock, first hold sp_group_sem */ + down_read(&sp_group_sem); + mutex_lock(&spg_stat->lock); + hash_for_each(spg_stat->hash, i, spg_proc_stat, gnode) { + proc_stat = spg_proc_stat->proc_stat; + tgid = proc_stat->tgid; + mm = proc_stat->mm; + + get_mm_rss_info(mm, &anon, &file, &shmem, &total_rss); + get_process_sp_res(proc_stat, &sp_res, &sp_res_nsize); + get_process_non_sp_res(total_rss, shmem, sp_res_nsize, + &non_sp_res, &non_sp_shm); + prot = get_process_prot_locked(id, mm); + + seq_printf(seq, "%-8d ", tgid); + if (id == 0) + seq_printf(seq, "%-8c ", '-'); + else + seq_printf(seq, "%-8d ", id); + seq_printf(seq, "%-9ld %-9ld %-9ld %-10ld %-10ld %-8ld %-7ld %-7ld %-10ld ", + get_spg_proc_alloc(spg_proc_stat), + get_spg_proc_k2u(spg_proc_stat), + get_sp_res_by_spg_proc(spg_proc_stat), + sp_res, non_sp_res, + page2kb(mm->total_vm), page2kb(total_rss), + page2kb(shmem), non_sp_shm); + print_process_prot(seq, prot); + seq_putc(seq, '\n'); + } + mutex_unlock(&spg_stat->lock); + up_read(&sp_group_sem); + return 0; +} + +static int proc_stat_show(struct seq_file *seq, void *offset) +{ + spg_overview_show(seq); + spa_overview_show(seq); + /* print the file header */ + seq_printf(seq, "%-8s %-8s %-9s %-9s %-9s %-10s %-10s %-8s %-7s %-7s %-10s %-4s\n", + "PID", "Group_ID", "SP_ALLOC", "SP_K2U", "SP_RES", "SP_RES_T", + "Non-SP_RES", "VIRT", "RES", "Shm", "Non-SP_Shm", "PROT"); + /* print kthread buff_module_guard_work */ + seq_printf(seq, "%-8s %-8s %-9lld %-9lld\n", + "guard", "-", + byte2kb(atomic64_read(&kthread_stat.alloc_size)), + byte2kb(atomic64_read(&kthread_stat.k2u_size))); + + /* pay attention to potential ABBA deadlock */ + down_read(&sp_spg_stat_sem); + idr_for_each(&sp_spg_stat_idr, idr_proc_stat_cb, seq); + up_read(&sp_spg_stat_sem); + return 0; +} + +static int idr_proc_overview_cb(int id, void *p, void *data) +{ + struct sp_proc_stat *proc_stat = p; + struct seq_file *seq = data; + struct mm_struct *mm = proc_stat->mm; + unsigned long anon, file, shmem, total_rss; + long sp_res, 
sp_res_nsize, non_sp_res, non_sp_shm; + + get_mm_rss_info(mm, &anon, &file, &shmem, &total_rss); + get_process_sp_res(proc_stat, &sp_res, &sp_res_nsize); + get_process_non_sp_res(total_rss, shmem, sp_res_nsize, + &non_sp_res, &non_sp_shm); + + seq_printf(seq, "%-8d %-16s %-9ld %-9ld %-9ld %-10ld %-10ld %-8ld\n", + id, proc_stat->comm, + get_proc_alloc(proc_stat), + get_proc_k2u(proc_stat), + sp_res, non_sp_res, non_sp_shm, + page2kb(mm->total_vm)); + return 0; +} + +static int proc_overview_show(struct seq_file *seq, void *offset) +{ + seq_printf(seq, "%-8s %-16s %-9s %-9s %-9s %-10s %-10s %-8s\n", + "PID", "COMM", "SP_ALLOC", "SP_K2U", "SP_RES", "Non-SP_RES", + "Non-SP_Shm", "VIRT"); + + down_read(&sp_proc_stat_sem); + idr_for_each(&sp_proc_stat_idr, idr_proc_overview_cb, seq); + up_read(&sp_proc_stat_sem); + return 0; +} + +static void __init proc_sharepool_init(void) +{ + if (!proc_mkdir("sharepool", NULL)) + return; + + proc_create_single_data("sharepool/proc_stat", 0400, NULL, proc_stat_show, NULL); + proc_create_single_data("sharepool/spa_stat", 0400, NULL, spa_stat_show, NULL); + proc_create_single_data("sharepool/proc_overview", 0400, NULL, proc_overview_show, NULL); +} + +/*** End of statistical and maintenance functions ***/ + DEFINE_STATIC_KEY_FALSE(share_pool_enabled_key);
static int __init enable_share_pool(char *s) @@ -446,6 +1375,7 @@ static int __init share_pool_init(void) goto fail;
sp_device_number_detect(); + proc_sharepool_init();
return 0; fail:
From: Wang Wensheng wangwensheng4@huawei.com
ascend inclusion category: Feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4NDAW CVE: NA
-------------------
This flag specifies that the vm_area_struct is related to share_pool (specifically, that it has a corresponding spa).
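Since VM_SHARE_POOL aliases VM_HIGH_ARCH_4, the flag is only meaningful on architectures with high VM flag bits. A sketch of how later patches can be expected to test a VMA for it, using the sp_check_vm_share_pool() helper added below (the demo function name is made up):

/* true iff the share pool is enabled and the vma carries VM_SHARE_POOL */
static bool demo_vma_is_share_pool(struct vm_area_struct *vma)
{
	return sp_check_vm_share_pool(vma->vm_flags);
}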
Signed-off-by: Wang Wensheng wangwensheng4@huawei.com Signed-off-by: Tang Yizhou tangyizhou@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Reviewed-by: Weilong Chen chenweilong@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- include/linux/mm.h | 6 ++++++ include/linux/share_pool.h | 8 ++++++++ 2 files changed, 14 insertions(+)
diff --git a/include/linux/mm.h b/include/linux/mm.h index a4996e11cda6..e2eb3ea63d6a 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -378,6 +378,12 @@ extern unsigned int kobjsize(const void *objp); # define VM_MTE_ALLOWED VM_NONE #endif
+#if defined(CONFIG_ASCEND_SHARE_POOL) +# define VM_SHARE_POOL VM_HIGH_ARCH_4 +#else +# define VM_SHARE_POOL VM_NONE +#endif + #ifndef VM_GROWSUP # define VM_GROWSUP VM_NONE #endif diff --git a/include/linux/share_pool.h b/include/linux/share_pool.h index 3a56238c8a4d..c39780ab5a87 100644 --- a/include/linux/share_pool.h +++ b/include/linux/share_pool.h @@ -272,6 +272,14 @@ static inline void sp_area_work_around(struct vm_unmapped_area_info *info, info->high_limit = min(info->high_limit, MMAP_SHARE_POOL_START); }
+static inline bool sp_check_vm_share_pool(unsigned long vm_flags) +{ + if (sp_is_enabled() && (vm_flags & VM_SHARE_POOL)) + return true; + + return false; +} + #else /* CONFIG_ASCEND_SHARE_POOL */
static inline int mg_sp_group_add_task(int pid, unsigned long prot, int spg_id)
From: Wang Wensheng wangwensheng4@huawei.com
ascend inclusion category: Feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4NDAW CVE: NA
-------------------
This is a simple wrapper around walk_page_range() to get all the pages of an spa. It doesn't support holes.
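A minimal usage sketch, assuming only the signatures and the sp_walk_data fields visible in the diff below (the demo function and the debug print are illustrative):

static int demo_pin_user_range(struct task_struct *tsk, unsigned long uva,
			       unsigned long size)
{
	struct sp_walk_data data;
	unsigned int i;
	int ret;

	ret = sp_walk_page_range(uva, size, tsk, &data);
	if (ret)
		return ret;		/* on failure, data must not be used */

	/* every entry holds a page reference taken by the walk */
	for (i = 0; i < data.page_count; i++)
		pr_debug("page %u: %p\n", i, data.pages[i]);

	sp_walk_page_free(&data);	/* drops the references, frees the array */
	return 0;
}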
Signed-off-by: Wang Wensheng wangwensheng4@huawei.com Signed-off-by: Tang Yizhou tangyizhou@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Reviewed-by: Weilong Chen chenweilong@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- mm/share_pool.c | 243 +++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 241 insertions(+), 2 deletions(-)
diff --git a/mm/share_pool.c b/mm/share_pool.c index fe0e36a2214e..28bf0de8813b 100644 --- a/mm/share_pool.c +++ b/mm/share_pool.c @@ -51,6 +51,7 @@ #include <linux/mmzone.h> #include <linux/timekeeping.h> #include <linux/time64.h> +#include <linux/pagewalk.h>
/* access control mode macros */ #define AC_NONE 0 @@ -494,6 +495,12 @@ static struct file *spa_file(struct sp_area *spa) return spa->spg->file; }
+static inline void check_interrupt_context(void) +{ + if (unlikely(in_interrupt())) + panic("function can't be used in interrupt context\n"); +} + static struct sp_group *create_spg(int spg_id) { return NULL; @@ -664,6 +671,201 @@ void *mg_sp_make_share_k2u(unsigned long kva, unsigned long size, } EXPORT_SYMBOL_GPL(mg_sp_make_share_k2u);
+static int sp_pmd_entry(pmd_t *pmd, unsigned long addr, + unsigned long next, struct mm_walk *walk) +{ + struct sp_walk_data *sp_walk_data = walk->private; + + sp_walk_data->pmd = pmd; + return 0; +} + +static int sp_pte_entry(pte_t *pte, unsigned long addr, + unsigned long next, struct mm_walk *walk) +{ + struct page *page; + struct sp_walk_data *sp_walk_data = walk->private; + pmd_t *pmd = sp_walk_data->pmd; + +retry: + if (unlikely(!pte_present(*pte))) { + swp_entry_t entry; + + if (pte_none(*pte)) + goto no_page; + entry = pte_to_swp_entry(*pte); + if (!is_migration_entry(entry)) + goto no_page; + migration_entry_wait(walk->mm, pmd, addr); + goto retry; + } + + page = pte_page(*pte); + get_page(page); + sp_walk_data->pages[sp_walk_data->page_count++] = page; + return 0; + +no_page: + pr_debug("the page of addr %lx unexpectedly not in RAM\n", + (unsigned long)addr); + return -EFAULT; +} + +static int sp_test_walk(unsigned long addr, unsigned long next, + struct mm_walk *walk) +{ + /* + * FIXME: The devmm driver uses remap_pfn_range() but actually there + * are associated struct pages, so they should use vm_map_pages() or + * similar APIs. Before the driver has been converted to correct APIs + * we use this test_walk() callback so we can treat VM_PFNMAP VMAs as + * normal VMAs. + */ + return 0; +} + +static int sp_pte_hole(unsigned long start, unsigned long end, + int depth, struct mm_walk *walk) +{ + pr_debug("hole [%lx, %lx) appeared unexpectedly\n", (unsigned long)start, (unsigned long)end); + return -EFAULT; +} + +static int sp_hugetlb_entry(pte_t *ptep, unsigned long hmask, + unsigned long addr, unsigned long next, + struct mm_walk *walk) +{ + pte_t pte = huge_ptep_get(ptep); + struct page *page = pte_page(pte); + struct sp_walk_data *sp_walk_data; + + if (unlikely(!pte_present(pte))) { + pr_debug("the page of addr %lx unexpectedly not in RAM\n", (unsigned long)addr); + return -EFAULT; + } + + sp_walk_data = walk->private; + get_page(page); + sp_walk_data->pages[sp_walk_data->page_count++] = page; + return 0; +} + +/* + * __sp_walk_page_range() - Walk page table with caller specific callbacks. + * @uva: the start VA of user memory. + * @size: the size of user memory. + * @mm: mm struct of the target task. + * @sp_walk_data: a structure of a page pointer array. + * + * The caller must hold mm->mmap_lock. + * + * Notes for parameter alignment: + * When size == 0, let it be page_size, so that at least one page is walked. + * + * When size > 0, for convenience, usually the parameters of uva and + * size are not page aligned. There are four different alignment scenarios and + * we must handle all of them correctly. + * + * The basic idea is to align down uva and align up size so all the pages + * in range [uva, uva + size) are walked. However, there are special cases. + * + * Consider a 2M-hugepage scenario where the caller wants to traverse + * the range [1001M, 1004.5M), so uva and size are 1001M and 3.5M + * respectively. The aligned-down uva is 1000M and the aligned-up size is 4M. + * The traverse range will be [1000M, 1004M). Obviously, the final page for + * [1004M, 1004.5M) is not covered.
+ * + * To fix this problem, we need to walk one additional page: size should be + * ALIGN(uva + size, page_size) - uva_aligned + */ +static int __sp_walk_page_range(unsigned long uva, unsigned long size, + struct mm_struct *mm, struct sp_walk_data *sp_walk_data) +{ + int ret = 0; + struct vm_area_struct *vma; + unsigned long page_nr; + struct page **pages = NULL; + bool is_hugepage = false; + unsigned long uva_aligned; + unsigned long size_aligned; + unsigned int page_size = PAGE_SIZE; + struct mm_walk_ops sp_walk = {}; + + /* + * Here we also support non-share-pool memory in this interface + * because the caller can't distinguish whether a uva is from the + * share pool or not. It is not the best idea to do so, but currently + * it simplifies overall design. + * + * In this situation, the correctness of the parameters is mainly + * guaranteed by the caller. + */ + vma = find_vma(mm, uva); + if (!vma) { + pr_debug("u2k input uva %lx is invalid\n", (unsigned long)uva); + return -EINVAL; + } + if (is_vm_hugetlb_page(vma)) + is_hugepage = true; + + sp_walk.pte_hole = sp_pte_hole; + sp_walk.test_walk = sp_test_walk; + if (is_hugepage) { + sp_walk_data->is_hugepage = true; + sp_walk.hugetlb_entry = sp_hugetlb_entry; + page_size = PMD_SIZE; + } else { + sp_walk_data->is_hugepage = false; + sp_walk.pte_entry = sp_pte_entry; + sp_walk.pmd_entry = sp_pmd_entry; + } + + sp_walk_data->page_size = page_size; + uva_aligned = ALIGN_DOWN(uva, page_size); + sp_walk_data->uva_aligned = uva_aligned; + if (size == 0) + size_aligned = page_size; + else + /* special alignment handling */ + size_aligned = ALIGN(uva + size, page_size) - uva_aligned; + + if (uva_aligned + size_aligned < uva_aligned) { + pr_err_ratelimited("overflow happened in walk page range\n"); + return -EINVAL; + } + + page_nr = size_aligned / page_size; + pages = kvmalloc(page_nr * sizeof(struct page *), GFP_KERNEL); + if (!pages) { + pr_err_ratelimited("alloc page array failed in walk page range\n"); + return -ENOMEM; + } + sp_walk_data->pages = pages; + + ret = walk_page_range(mm, uva_aligned, uva_aligned + size_aligned, + &sp_walk, sp_walk_data); + if (ret) + kvfree(pages); + + return ret; +} + +static void __sp_walk_page_free(struct sp_walk_data *data) +{ + int i = 0; + struct page *page; + + while (i < data->page_count) { + page = data->pages[i++]; + put_page(page); + } + + kvfree(data->pages); + /* prevent repeated release */ + data->page_count = 0; + data->pages = NULL; +} + /** * sp_make_share_u2k() - Share user memory of a specified process to kernel. * @uva: the VA of shared user memory @@ -723,7 +925,39 @@ EXPORT_SYMBOL_GPL(mg_sp_unshare); int sp_walk_page_range(unsigned long uva, unsigned long size, struct task_struct *tsk, struct sp_walk_data *sp_walk_data) { - return 0; + struct mm_struct *mm; + int ret = 0; + + check_interrupt_context(); + + if (unlikely(!sp_walk_data)) { + pr_err_ratelimited("null pointer when walk page range\n"); + return -EINVAL; + } + if (!tsk || (tsk->flags & PF_EXITING)) + return -ESRCH; + + get_task_struct(tsk); + mm = get_task_mm(tsk); + if (!mm) { + put_task_struct(tsk); + return -ESRCH; + } + + sp_walk_data->page_count = 0; + down_write(&mm->mmap_lock); + if (likely(!mm->core_state)) + ret = __sp_walk_page_range(uva, size, mm, sp_walk_data); + else { + pr_err("walk page range: encountered coredump\n"); + ret = -ESRCH; + } + up_write(&mm->mmap_lock); + + mmput(mm); + put_task_struct(tsk); + + return ret; } EXPORT_SYMBOL_GPL(sp_walk_page_range);
@@ -740,7 +974,12 @@ EXPORT_SYMBOL_GPL(mg_sp_walk_page_range); */ void sp_walk_page_free(struct sp_walk_data *sp_walk_data) { - return; + check_interrupt_context(); + + if (!sp_walk_data) + return; + + __sp_walk_page_free(sp_walk_data); } EXPORT_SYMBOL_GPL(sp_walk_page_free);
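The alignment rules described in the __sp_walk_page_range() comment can be checked in isolation. Below is a minimal userspace C sketch of the same arithmetic, assuming power-of-two alignments; ALIGN()/ALIGN_DOWN() mirror the kernel macros, and the 1001M/3.5M figures come from the example in the comment above:

#include <stdio.h>

#define ALIGN_DOWN(x, a) ((x) & ~((a) - 1))
#define ALIGN(x, a)      (((x) + (a) - 1) & ~((a) - 1))

int main(void)
{
	unsigned long pmd_size = 2UL << 20;             /* 2M hugepage */
	unsigned long uva = 1001UL << 20;               /* 1001M */
	unsigned long size = (3UL << 20) + (1UL << 19); /* 3.5M */
	unsigned long uva_aligned = ALIGN_DOWN(uva, pmd_size);
	/* naive: aligning size alone misses the tail page */
	unsigned long naive = ALIGN(size, pmd_size);
	/* correct: align the end address, then subtract the aligned start */
	unsigned long size_aligned = ALIGN(uva + size, pmd_size) - uva_aligned;

	printf("naive walk   [%luM, %luM)\n",
	       uva_aligned >> 20, (uva_aligned + naive) >> 20);
	printf("correct walk [%luM, %luM)\n",
	       uva_aligned >> 20, (uva_aligned + size_aligned) >> 20);
	return 0;
}

The naive ALIGN(size) variant loses the tail page; aligning the end address uva + size first is what keeps [1004M, 1004.5M) inside the walk.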
From: Wang Wensheng wangwensheng4@huawei.com
ascend inclusion category: Feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4NDAW CVE: NA
-------------------
An sp-area specifies a memory region used by the share pool. Add alloc/get/drop operations for sp-areas.
Signed-off-by: Wang Wensheng wangwensheng4@huawei.com Signed-off-by: Tang Yizhou tangyizhou@huawei.com Signed-off-by: Peng Wu wupeng58@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Reviewed-by: Weilong Chen chenweilong@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- include/linux/share_pool.h | 8 + kernel/sysctl.c | 12 + mm/share_pool.c | 463 +++++++++++++++++++++++++++++++++++-- 3 files changed, 463 insertions(+), 20 deletions(-)
diff --git a/include/linux/share_pool.h b/include/linux/share_pool.h index c39780ab5a87..7e7ced34be57 100644 --- a/include/linux/share_pool.h +++ b/include/linux/share_pool.h @@ -257,6 +257,8 @@ extern bool mg_is_sharepool_addr(unsigned long addr); extern int mg_sp_group_add_task(int pid, unsigned long prot, int spg_id); extern int sp_group_add_task(int pid, int spg_id);
+extern void sp_area_drop(struct vm_area_struct *vma); + static inline bool sp_is_enabled(void) { return static_branch_likely(&share_pool_enabled_key); @@ -280,6 +282,12 @@ static inline bool sp_check_vm_share_pool(unsigned long vm_flags) return false; }
+static inline void sp_dump_stack(void) +{ + if (sysctl_sp_debug_mode) + dump_stack(); +} + #else /* CONFIG_ASCEND_SHARE_POOL */
static inline int mg_sp_group_add_task(int pid, unsigned long prot, int spg_id) diff --git a/kernel/sysctl.c b/kernel/sysctl.c index ce200213ccbd..8f417c7b12e8 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -71,6 +71,7 @@ #include <linux/coredump.h> #include <linux/latencytop.h> #include <linux/pid.h> +#include <linux/share_pool.h>
#include "../lib/kstrtox.h"
@@ -3257,6 +3258,17 @@ static struct ctl_table vm_table[] = { .proc_handler = proc_shrink_node_caches, .extra1 = SYSCTL_ZERO, }, +#endif +#ifdef CONFIG_ASCEND_SHARE_POOL + { + .procname = "sharepool_debug_mode", + .data = &sysctl_sp_debug_mode, + .maxlen = sizeof(sysctl_sp_debug_mode), + .mode = 0600, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, #endif { } }; diff --git a/mm/share_pool.c b/mm/share_pool.c index 28bf0de8813b..0ea113b904cf 100644 --- a/mm/share_pool.c +++ b/mm/share_pool.c @@ -84,6 +84,9 @@ static const int mdc_default_group_id = 1; /* share the uva to the whole group */ static int __read_mostly enable_share_k2u_spg = 1;
+/* debug mode */ +int sysctl_sp_debug_mode; + static int share_pool_group_mode = SINGLE_GROUP_MODE;
static unsigned int sp_device_number; @@ -495,32 +498,136 @@ static struct file *spa_file(struct sp_area *spa) return spa->spg->file; }
-static inline void check_interrupt_context(void) -{ - if (unlikely(in_interrupt())) - panic("function can't be used in interrupt context\n"); +/* the caller should hold sp_area_lock */ +static void spa_inc_usage(struct sp_area *spa) +{ + enum spa_type type = spa->type; + unsigned long size = spa->real_size; + bool is_dvpp = spa->flags & SP_DVPP; + bool is_huge = spa->is_hugepage; + + switch (type) { + case SPA_TYPE_ALLOC: + spa_stat.alloc_num += 1; + spa_stat.alloc_size += size; + update_spg_stat_alloc(size, true, is_huge, spa->spg->stat); + break; + case SPA_TYPE_K2TASK: + spa_stat.k2u_task_num += 1; + spa_stat.k2u_task_size += size; + update_spg_stat_k2u(size, true, spg_none->stat); + break; + case SPA_TYPE_K2SPG: + spa_stat.k2u_spg_num += 1; + spa_stat.k2u_spg_size += size; + update_spg_stat_k2u(size, true, spa->spg->stat); + break; + default: + WARN(1, "invalid spa type"); + } + + if (is_dvpp) { + spa_stat.dvpp_size += size; + spa_stat.dvpp_va_size += ALIGN(size, PMD_SIZE); + } + + /* + * all the calculations won't overflow due to system limitation and + * parameter checking in sp_alloc_area() + */ + spa_stat.total_num += 1; + spa_stat.total_size += size; + + if (spa->spg != spg_none) { + atomic_inc(&sp_overall_stat.spa_total_num); + atomic64_add(size, &sp_overall_stat.spa_total_size); + } }
-static struct sp_group *create_spg(int spg_id) -{ - return NULL; +/* the caller should hold sp_area_lock */ +static void spa_dec_usage(struct sp_area *spa) +{ + enum spa_type type = spa->type; + unsigned long size = spa->real_size; + bool is_dvpp = spa->flags & SP_DVPP; + bool is_huge = spa->is_hugepage; + + switch (type) { + case SPA_TYPE_ALLOC: + spa_stat.alloc_num -= 1; + spa_stat.alloc_size -= size; + update_spg_stat_alloc(size, false, is_huge, spa->spg->stat); + break; + case SPA_TYPE_K2TASK: + spa_stat.k2u_task_num -= 1; + spa_stat.k2u_task_size -= size; + update_spg_stat_k2u(size, false, spg_none->stat); + break; + case SPA_TYPE_K2SPG: + spa_stat.k2u_spg_num -= 1; + spa_stat.k2u_spg_size -= size; + update_spg_stat_k2u(size, false, spa->spg->stat); + break; + default: + WARN(1, "invalid spa type"); + } + + if (is_dvpp) { + spa_stat.dvpp_size -= size; + spa_stat.dvpp_va_size -= ALIGN(size, PMD_SIZE); + } + + spa_stat.total_num -= 1; + spa_stat.total_size -= size; + + if (spa->spg != spg_none) { + atomic_dec(&sp_overall_stat.spa_total_num); + atomic64_sub(spa->real_size, &sp_overall_stat.spa_total_size); + } }
-static bool is_online_node_id(int node_id) +static void update_spg_proc_stat(unsigned long size, bool inc, + struct spg_proc_stat *stat, enum spa_type type) { - return node_id >= 0 && node_id < MAX_NUMNODES && node_online(node_id); + if (unlikely(!stat)) { + sp_dump_stack(); + WARN(1, "null process stat\n"); + return; + } + + switch (type) { + case SPA_TYPE_ALLOC: + update_spg_proc_stat_alloc(size, inc, stat); + break; + case SPA_TYPE_K2TASK: + case SPA_TYPE_K2SPG: + update_spg_proc_stat_k2u(size, inc, stat); + break; + default: + WARN(1, "invalid stat type\n"); + } }
-static bool is_device_addr(unsigned long addr) +static void sp_update_process_stat(struct task_struct *tsk, bool inc, + struct sp_area *spa) { - int i; + struct spg_proc_stat *stat; + unsigned long size = spa->real_size; + enum spa_type type = spa->type;
- for (i = 0; i < sp_device_number; i++) { - if (addr >= sp_dev_va_start[i] && - addr < sp_dev_va_start[i] + sp_dev_va_size[i]) - return true; - } - return false; + down_write(&sp_group_sem); + stat = sp_init_process_stat(tsk, tsk->mm, spa->spg); + up_write(&sp_group_sem); + if (unlikely(IS_ERR(stat))) + return; + + update_spg_proc_stat(size, inc, stat, type); +} + +static inline void check_interrupt_context(void) +{ + if (unlikely(in_interrupt())) + panic("function can't be used in interrupt context\n"); }
/** @@ -555,6 +662,28 @@ int mg_sp_group_id_by_pid(int pid, int *spg_ids, int *num) } EXPORT_SYMBOL_GPL(mg_sp_group_id_by_pid);
+static bool is_online_node_id(int node_id) +{ + return node_id >= 0 && node_id < MAX_NUMNODES && node_online(node_id); +} + +static bool is_device_addr(unsigned long addr) +{ + int i; + + for (i = 0; i < sp_device_number; i++) { + if (addr >= sp_dev_va_start[i] && + addr < sp_dev_va_start[i] + sp_dev_va_size[i]) + return true; + } + return false; +} + +static struct sp_group *create_spg(int spg_id) +{ + return NULL; +} + int mg_sp_group_add_task(int pid, unsigned long prot, int spg_id) { return 0; @@ -567,9 +696,7 @@ int sp_group_add_task(int pid, int spg_id) } EXPORT_SYMBOL_GPL(sp_group_add_task);
-static void __sp_area_drop_locked(struct sp_area *spa) -{ -} +static void __sp_area_drop_locked(struct sp_area *spa);
/** * mg_sp_group_del_task() - delete a process from a sp group. @@ -595,6 +722,302 @@ int sp_group_del_task(int pid, int spg_id) } EXPORT_SYMBOL_GPL(sp_group_del_task);
+/* the caller must hold sp_area_lock */ +static void __insert_sp_area(struct sp_area *spa) +{ + struct rb_node **p = &sp_area_root.rb_node; + struct rb_node *parent = NULL; + + while (*p) { + struct sp_area *tmp; + + parent = *p; + tmp = rb_entry(parent, struct sp_area, rb_node); + if (spa->va_start < tmp->va_end) + p = &(*p)->rb_left; + else if (spa->va_end > tmp->va_start) + p = &(*p)->rb_right; + else + BUG(); + } + + rb_link_node(&spa->rb_node, parent, p); + rb_insert_color(&spa->rb_node, &sp_area_root); +} + +/* The sp_area cache globals are protected by sp_area_lock */ +static struct rb_node *free_sp_area_cache; +static unsigned long cached_hole_size; +static unsigned long cached_vstart; /* affected by SP_DVPP and sp_config_dvpp_range() */ + +/** + * sp_alloc_area() - Allocate a region of VA from the share pool. + * @size: the size of VA to allocate. + * @flags: how to allocate the memory. + * @spg: the share group that the memory is allocated to. + * @type: the type of the region. + * @applier: the pid of the task which allocates the region. + * + * Return: a valid pointer for success, NULL on failure. + */ +static struct sp_area *sp_alloc_area(unsigned long size, unsigned long flags, + struct sp_group *spg, enum spa_type type, + pid_t applier) +{ + struct sp_area *spa, *first, *err; + struct rb_node *n; + unsigned long vstart = MMAP_SHARE_POOL_START; + unsigned long vend = MMAP_SHARE_POOL_16G_START; + unsigned long addr; + unsigned long size_align = ALIGN(size, PMD_SIZE); /* va aligned to 2M */ + int device_id, node_id; + + device_id = sp_flags_device_id(flags); + node_id = flags & SP_SPEC_NODE_ID ? sp_flags_node_id(flags) : device_id; + + if (!is_online_node_id(node_id)) { + pr_err_ratelimited("invalid numa node id %d\n", node_id); + return ERR_PTR(-EINVAL); + } + + if ((flags & SP_DVPP)) { + if (!is_sp_dev_addr_enabled(device_id)) { + vstart = MMAP_SHARE_POOL_16G_START + + device_id * MMAP_SHARE_POOL_16G_SIZE; + vend = vstart + MMAP_SHARE_POOL_16G_SIZE; + } else { + vstart = sp_dev_va_start[device_id]; + vend = vstart + sp_dev_va_size[device_id]; + } + } + + spa = __kmalloc_node(sizeof(struct sp_area), GFP_KERNEL, node_id); + if (unlikely(!spa)) + return ERR_PTR(-ENOMEM); + + spin_lock(&sp_area_lock); + + /* + * Invalidate cache if we have more permissive parameters. + * cached_hole_size notes the largest hole noticed _below_ + * the sp_area cached in free_sp_area_cache: if size fits + * into that hole, we want to scan from vstart to reuse + * the hole instead of allocating above free_sp_area_cache. + * Note that sp_free_area may update free_sp_area_cache + * without updating cached_hole_size. 
+ */ + if (!free_sp_area_cache || size_align < cached_hole_size || + vstart != cached_vstart) { + cached_hole_size = 0; + free_sp_area_cache = NULL; + } + + /* record if we encounter less permissive parameters */ + cached_vstart = vstart; + + /* find starting point for our search */ + if (free_sp_area_cache) { + first = rb_entry(free_sp_area_cache, struct sp_area, rb_node); + addr = first->va_end; + if (addr + size_align < addr) { + err = ERR_PTR(-EOVERFLOW); + goto error; + } + } else { + addr = vstart; + if (addr + size_align < addr) { + err = ERR_PTR(-EOVERFLOW); + goto error; + } + + n = sp_area_root.rb_node; + first = NULL; + + while (n) { + struct sp_area *tmp; + + tmp = rb_entry(n, struct sp_area, rb_node); + if (tmp->va_end >= addr) { + first = tmp; + if (tmp->va_start <= addr) + break; + n = n->rb_left; + } else + n = n->rb_right; + } + + if (!first) + goto found; + } + + /* from the starting point, traverse areas until a suitable hole is found */ + while (addr + size_align > first->va_start && addr + size_align <= vend) { + if (addr + cached_hole_size < first->va_start) + cached_hole_size = first->va_start - addr; + addr = first->va_end; + if (addr + size_align < addr) { + err = ERR_PTR(-EOVERFLOW); + goto error; + } + + n = rb_next(&first->rb_node); + if (n) + first = rb_entry(n, struct sp_area, rb_node); + else + goto found; + } + +found: + if (addr + size_align > vend) { + err = ERR_PTR(-EOVERFLOW); + goto error; + } + + spa->va_start = addr; + spa->va_end = addr + size_align; + spa->real_size = size; + spa->region_vstart = vstart; + spa->flags = flags; + spa->is_hugepage = (flags & SP_HUGEPAGE); + spa->is_dead = false; + spa->spg = spg; + atomic_set(&spa->use_count, 1); + spa->type = type; + spa->mm = NULL; + spa->kva = 0; /* NULL pointer */ + spa->applier = applier; + spa->node_id = node_id; + spa->device_id = device_id; + + spa_inc_usage(spa); + __insert_sp_area(spa); + free_sp_area_cache = &spa->rb_node; + if (spa->spg != spg_none) + list_add_tail(&spa->link, &spg->spa_list); + + spin_unlock(&sp_area_lock); + + return spa; + +error: + spin_unlock(&sp_area_lock); + kfree(spa); + return err; +} + +/* the caller should hold sp_area_lock */ +static struct sp_area *__find_sp_area_locked(unsigned long addr) +{ + struct rb_node *n = sp_area_root.rb_node; + + while (n) { + struct sp_area *spa; + + spa = rb_entry(n, struct sp_area, rb_node); + if (addr < spa->va_start) { + n = n->rb_left; + } else if (addr > spa->va_start) { + n = n->rb_right; + } else { + return spa; + } + } + + return NULL; +} + +static struct sp_area *__find_sp_area(unsigned long addr) +{ + struct sp_area *n; + + spin_lock(&sp_area_lock); + n = __find_sp_area_locked(addr); + if (n) + atomic_inc(&n->use_count); + spin_unlock(&sp_area_lock); + return n; +} + +/* + * Free the VA region starting from addr to the share pool + */ +static void sp_free_area(struct sp_area *spa) +{ + lockdep_assert_held(&sp_area_lock); + + if (free_sp_area_cache) { + struct sp_area *cache; + + cache = rb_entry(free_sp_area_cache, struct sp_area, rb_node); + if (spa->va_start <= cache->va_start) { + free_sp_area_cache = rb_prev(&spa->rb_node); + /* + * the new cache node may be changed to another region, + * i.e. from DVPP region to normal region + */ + if (free_sp_area_cache) { + cache = rb_entry(free_sp_area_cache, + struct sp_area, rb_node); + cached_vstart = cache->region_vstart; + } + /* + * We don't try to update cached_hole_size, + * but it won't go very wrong. 
+ */ + } + } + + spa_dec_usage(spa); + if (spa->spg != spg_none) + list_del(&spa->link); + + rb_erase(&spa->rb_node, &sp_area_root); + RB_CLEAR_NODE(&spa->rb_node); + kfree(spa); +} + +static void __sp_area_drop_locked(struct sp_area *spa) +{ + /* + * Considering a situation where task A and B are in the same spg. + * A is exiting and calling remove_vma(). Before A calls this func, + * B calls sp_free() to free the same spa. So spa maybe NULL when A + * calls this func later. + */ + if (!spa) + return; + + if (atomic_dec_and_test(&spa->use_count)) + sp_free_area(spa); +} + +static void __sp_area_drop(struct sp_area *spa) +{ + spin_lock(&sp_area_lock); + __sp_area_drop_locked(spa); + spin_unlock(&sp_area_lock); +} + +void sp_area_drop(struct vm_area_struct *vma) +{ + struct sp_area *spa; + + if (!(vma->vm_flags & VM_SHARE_POOL)) + return; + + /* + * Considering a situation where task A and B are in the same spg. + * A is exiting and calling remove_vma() -> ... -> sp_area_drop(). + * Concurrently, B is calling sp_free() to free the same spa. + * __find_sp_area_locked() and __sp_area_drop_locked() should be + * an atomic operation. + */ + spin_lock(&sp_area_lock); + spa = __find_sp_area_locked(vma->vm_start); + __sp_area_drop_locked(spa); + spin_unlock(&sp_area_lock); +} + /** * sp_free() - Free the memory allocated by sp_alloc(). * @addr: the starting VA of the memory.
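The hole search in sp_alloc_area() is easier to follow with the rbtree and the free_sp_area_cache optimization stripped away. Here is a sketch of the underlying first-fit scan; the sorted array and all names are illustrative stand-ins, not the kernel's data structures:

#include <stdio.h>

struct area { unsigned long start, end; };

/*
 * First-fit scan mirroring the loop in sp_alloc_area(): start at vstart,
 * hop over every occupied area that conflicts with the candidate range,
 * fail when the range would pass vend.
 */
static unsigned long first_fit(const struct area *busy, int n,
			       unsigned long vstart, unsigned long vend,
			       unsigned long size)
{
	unsigned long addr = vstart;
	int i;

	for (i = 0; i < n; i++) {
		if (addr + size <= busy[i].start)
			break;			/* hole before busy[i] fits */
		addr = busy[i].end;		/* skip past the occupied area */
	}
	if (addr + size > vend || addr + size < addr)
		return 0;			/* out of space, or overflow */
	return addr;
}

int main(void)
{
	struct area busy[] = { { 0x1000, 0x3000 }, { 0x4000, 0x6000 } };

	/* 0x1000 bytes fit in the hole [0x3000, 0x4000) */
	printf("allocated at %#lx\n",
	       first_fit(busy, 2, 0x1000, 0x10000, 0x1000));
	return 0;
}

The cached start in the real code only changes where the scan begins; the invariant is the same: addr always sits at the end of the last conflicting area, so the first hole that fits wins.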
From: Wang Wensheng wangwensheng4@huawei.com
ascend inclusion category: Feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4NDAW CVE: NA
-------------------
When memory is insufficient or fragmentation is severe, a 2MB hugepage allocation will perform direct reclaim and compaction.
Direct reclaim and compaction may take a long time. As a result, the sp mutex would be held long enough to cause hung task problems. In this case, set the PF_MEMALLOC flag to prevent direct reclaim and compaction from being executed.
With direct compaction no longer allowed during hugepage allocation, a 2MB hugepage allocation may fail.
During sp alloc, if a 2MB hugepage cannot be allocated or the total free memory is less than 1/3 of total memory, a work item is queued to compact the memory asynchronously.
During sp free, if the total free memory is less than 1/3 of total memory, compact the memory synchronously.
This can be disabled, and the compaction frequency tuned, through sysctl.
Signed-off-by: Zhou Guanghui zhouguanghui1@huawei.com Signed-off-by: Wang Wensheng wangwensheng4@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Reviewed-by: Weilong Chen chenweilong@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- kernel/sysctl.c | 18 ++++++++++++++++ mm/share_pool.c | 57 +++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 75 insertions(+)
diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 8f417c7b12e8..c7073b652b0c 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -3269,6 +3269,24 @@ static struct ctl_table vm_table[] = { .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, + { + .procname = "sharepool_compact_enable", + .data = &sysctl_sp_compact_enable, + .maxlen = sizeof(sysctl_sp_compact_enable), + .mode = 0600, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, + { + .procname = "sharepool_compact_interval", + .data = &sysctl_sp_compact_interval, + .maxlen = sizeof(sysctl_sp_compact_interval), + .mode = 0600, + .proc_handler = proc_doulongvec_minmax, + .extra1 = &zero_ul, + .extra2 = &sysctl_sp_compact_interval_max, + }, #endif { } }; diff --git a/mm/share_pool.c b/mm/share_pool.c index 0ea113b904cf..1cd1a64f2a8c 100644 --- a/mm/share_pool.c +++ b/mm/share_pool.c @@ -1018,6 +1018,63 @@ void sp_area_drop(struct vm_area_struct *vma) spin_unlock(&sp_area_lock); }
+int sysctl_sp_compact_enable; +unsigned long sysctl_sp_compact_interval = 30UL; +unsigned long sysctl_sp_compact_interval_max = 1000UL; +static unsigned long compact_last_jiffies; +static unsigned long compact_daemon_status; +#define COMPACT_START 1 +#define COMPACT_STOP 0 + +static void sp_compact_nodes(struct work_struct *work) +{ + sysctl_compaction_handler(NULL, 1, NULL, NULL, NULL); + + kfree(work); + + compact_last_jiffies = jiffies; + cmpxchg(&compact_daemon_status, COMPACT_START, COMPACT_STOP); +} + +static void sp_add_work_compact(void) +{ + struct work_struct *compact_work; + + if (!sysctl_sp_compact_enable) + return; + + /* experimental compaction time: 4GB->1.7s, 8GB->3.4s */ + if (!time_after(jiffies, + compact_last_jiffies + sysctl_sp_compact_interval * HZ)) + return; + + if (cmpxchg(&compact_daemon_status, COMPACT_STOP, COMPACT_START) == + COMPACT_START) + return; + + compact_work = kzalloc(sizeof(*compact_work), GFP_KERNEL); + if (!compact_work) + return; + + INIT_WORK(compact_work, sp_compact_nodes); + schedule_work(compact_work); +} + +static void sp_try_to_compact(void) +{ + unsigned long totalram; + unsigned long freeram; + + totalram = totalram_pages(); + freeram = global_zone_page_state(NR_FREE_PAGES); + + /* free < total / 3 */ + if ((freeram + (freeram << 1)) > totalram) + return; + + sp_add_work_compact(); +} + /** * sp_free() - Free the memory allocated by sp_alloc(). * @addr: the starting VA of the memory.
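Note that the trigger condition in sp_try_to_compact() is written multiplication-free: freeram + (freeram << 1) equals freeram * 3, so the early return fires exactly when freeram > totalram / 3, with no division. A small check of that identity:

#include <assert.h>
#include <stdio.h>

/* freeram * 3 computed as freeram + 2 * freeram, no multiply or divide */
static int enough_free(unsigned long freeram, unsigned long totalram)
{
	return (freeram + (freeram << 1)) > totalram;
}

int main(void)
{
	assert(!enough_free(1024, 4096));	/* 1024 < 4096/3: compact */
	assert(enough_free(2048, 4096));	/* 2048 > 4096/3: skip */
	printf("compaction runs only when free <= total/3\n");
	return 0;
}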
From: Wang Wensheng wangwensheng4@huawei.com
ascend inclusion category: Feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4NDAW CVE: NA
-------------------
This function maps user memory into kernel vmalloc space.
Signed-off-by: Wang Wensheng wangwensheng4@huawei.com Signed-off-by: Tang Yizhou tangyizhou@huawei.com Signed-off-by: Peng Wu wupeng58@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Reviewed-by: Weilong Chen chenweilong@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- mm/share_pool.c | 56 ++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 55 insertions(+), 1 deletion(-)
diff --git a/mm/share_pool.c b/mm/share_pool.c index 1cd1a64f2a8c..4d45a2519b10 100644 --- a/mm/share_pool.c +++ b/mm/share_pool.c @@ -1358,7 +1358,61 @@ static void __sp_walk_page_free(struct sp_walk_data *data) */ void *sp_make_share_u2k(unsigned long uva, unsigned long size, int pid) { - return NULL; + int ret = 0; + struct mm_struct *mm = current->mm; + void *p = ERR_PTR(-ESRCH); + struct sp_walk_data sp_walk_data = { + .page_count = 0, + }; + struct vm_struct *area; + + check_interrupt_context(); + + if (mm == NULL) { + pr_err("u2k: kthread is not allowed\n"); + return ERR_PTR(-EPERM); + } + + down_write(&mm->mmap_lock); + if (unlikely(mm->core_state)) { + up_write(&mm->mmap_lock); + pr_err("u2k: encountered coredump, abort\n"); + return p; + } + + ret = __sp_walk_page_range(uva, size, mm, &sp_walk_data); + if (ret) { + pr_err_ratelimited("walk page range failed %d\n", ret); + up_write(&mm->mmap_lock); + return ERR_PTR(ret); + } + + if (sp_walk_data.is_hugepage) + p = vmap_hugepage(sp_walk_data.pages, sp_walk_data.page_count, + VM_MAP, PAGE_KERNEL); + else + p = vmap(sp_walk_data.pages, sp_walk_data.page_count, VM_MAP, + PAGE_KERNEL); + up_write(&mm->mmap_lock); + + if (!p) { + pr_err("vmap(huge) in u2k failed\n"); + __sp_walk_page_free(&sp_walk_data); + return ERR_PTR(-ENOMEM); + } + + p = p + (uva - sp_walk_data.uva_aligned); + + /* + * kva p may be used later in k2u. Since p comes from uva originally, + * it's reasonable to add flag VM_USERMAP so that p can be remapped + * into userspace again. + */ + area = find_vm_area(p); + area->flags |= VM_USERMAP; + + kvfree(sp_walk_data.pages); + return p; } EXPORT_SYMBOL_GPL(sp_make_share_u2k);
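The returned pointer is advanced by uva - sp_walk_data.uva_aligned because vmap() covers whole pages while the caller's uva may start mid-page. The same map-aligned-then-offset pattern exists with userspace mmap(); a sketch, where the file position is illustrative and the input file is assumed to be large enough:

#include <fcntl.h>
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

int main(int argc, char **argv)
{
	long page = sysconf(_SC_PAGESIZE);
	off_t pos = 5000;			/* unaligned file position */
	off_t pos_aligned = pos & ~(page - 1);	/* ALIGN_DOWN(pos, page) */
	char *base;
	int fd;

	if (argc < 2)
		return 1;
	fd = open(argv[1], O_RDONLY);
	if (fd < 0)
		return 1;
	/* mappings are page granular, like the vmap() in u2k */
	base = mmap(NULL, page, PROT_READ, MAP_PRIVATE, fd, pos_aligned);
	if (base == MAP_FAILED)
		return 1;
	/* same adjustment as p + (uva - sp_walk_data.uva_aligned) */
	printf("byte %lld is 0x%02x\n", (long long)pos,
	       (unsigned char)base[pos - pos_aligned]);
	munmap(base, page);
	close(fd);
	return 0;
}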
From: Wang Wensheng wangwensheng4@huawei.com
ascend inclusion category: Feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4NDAW CVE: NA
-------------------
This is the reverse of sp_make_share_u2k(): it unmaps the vmalloc area that u2k created.
Signed-off-by: Wang Wensheng wangwensheng4@huawei.com Signed-off-by: Tang Yizhou tangyizhou@huawei.com Signed-off-by: Peng Wu wupeng58@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Reviewed-by: Weilong Chen chenweilong@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- mm/share_pool.c | 97 ++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 96 insertions(+), 1 deletion(-)
diff --git a/mm/share_pool.c b/mm/share_pool.c index 4d45a2519b10..8e357056110e 100644 --- a/mm/share_pool.c +++ b/mm/share_pool.c @@ -1120,6 +1120,36 @@ void *mg_sp_alloc(unsigned long size, unsigned long sp_flags, int spg_id) } EXPORT_SYMBOL_GPL(mg_sp_alloc);
+/** + * is_vmap_hugepage() - Check if a kernel address belongs to vmalloc family. + * @addr: the kernel space address to be checked. + * + * Return: + * * >0 - a vmalloc hugepage addr. + * * =0 - a normal vmalloc addr. + * * -errno - failure. + */ +static int is_vmap_hugepage(unsigned long addr) +{ + struct vm_struct *area; + + if (unlikely(!addr)) { + pr_err_ratelimited("null vmap addr pointer\n"); + return -EINVAL; + } + + area = find_vm_area((void *)addr); + if (unlikely(!area)) { + pr_debug("can't find vm area(%lx)\n", addr); + return -EINVAL; + } + + if (area->flags & VM_HUGE_PAGES) + return 1; + else + return 0; +} + /** * sp_make_share_k2u() - Share kernel memory to current process or an sp_group. * @kva: the VA of shared kernel memory. @@ -1422,6 +1452,55 @@ void *mg_sp_make_share_u2k(unsigned long uva, unsigned long size, int pid) } EXPORT_SYMBOL_GPL(mg_sp_make_share_u2k);
+static int sp_unshare_uva(unsigned long uva, unsigned long size) +{ + return 0; +} + +/* No possible concurrent protection, take care when use */ +static int sp_unshare_kva(unsigned long kva, unsigned long size) +{ + unsigned long addr, kva_aligned; + struct page *page; + unsigned long size_aligned; + unsigned long step; + bool is_hugepage = true; + int ret; + + ret = is_vmap_hugepage(kva); + if (ret > 0) { + kva_aligned = ALIGN_DOWN(kva, PMD_SIZE); + size_aligned = ALIGN(kva + size, PMD_SIZE) - kva_aligned; + step = PMD_SIZE; + } else if (ret == 0) { + kva_aligned = ALIGN_DOWN(kva, PAGE_SIZE); + size_aligned = ALIGN(kva + size, PAGE_SIZE) - kva_aligned; + step = PAGE_SIZE; + is_hugepage = false; + } else { + pr_err_ratelimited("check vmap hugepage failed %d\n", ret); + return -EINVAL; + } + + if (kva_aligned + size_aligned < kva_aligned) { + pr_err_ratelimited("overflow happened in unshare kva\n"); + return -EINVAL; + } + + for (addr = kva_aligned; addr < (kva_aligned + size_aligned); addr += step) { + page = vmalloc_to_page((void *)addr); + if (page) + put_page(page); + else + WARN(1, "vmalloc %pK to page/hugepage failed\n", + (void *)addr); + } + + vunmap((void *)kva_aligned); + + return 0; +} + /** * sp_unshare() - Unshare the kernel or user memory which shared by calling * sp_make_share_{k2u,u2k}(). @@ -1434,7 +1513,23 @@ EXPORT_SYMBOL_GPL(mg_sp_make_share_u2k); */ int sp_unshare(unsigned long va, unsigned long size, int pid, int spg_id) { - return 0; + int ret = 0; + + check_interrupt_context(); + + if (va < TASK_SIZE) { + /* user address */ + ret = sp_unshare_uva(va, size); + } else if (va >= PAGE_OFFSET) { + /* kernel address */ + ret = sp_unshare_kva(va, size); + } else { + /* regard user and kernel address ranges as bad address */ + pr_debug("unshare addr %lx is not a user or kernel addr\n", (unsigned long)va); + ret = -EFAULT; + } + + return ret; } EXPORT_SYMBOL_GPL(sp_unshare);
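sp_unshare() dispatches purely on the address range: below TASK_SIZE is user space, at or above PAGE_OFFSET is kernel space, and anything in between is rejected. A sketch of that classification; the constants here are illustrative stand-ins, since the real TASK_SIZE and PAGE_OFFSET depend on the architecture and configuration:

#include <stdio.h>

/* illustrative 64-bit split; the real values are arch and config specific */
#define DEMO_TASK_SIZE   0x0000800000000000UL
#define DEMO_PAGE_OFFSET 0xffff800000000000UL

enum addr_kind { ADDR_USER, ADDR_KERNEL, ADDR_BAD };

static enum addr_kind classify(unsigned long va)
{
	if (va < DEMO_TASK_SIZE)
		return ADDR_USER;	/* sp_unshare_uva() path */
	if (va >= DEMO_PAGE_OFFSET)
		return ADDR_KERNEL;	/* sp_unshare_kva() path */
	return ADDR_BAD;		/* non-canonical hole: -EFAULT */
}

int main(void)
{
	printf("%d %d %d\n", classify(0x400000UL),
	       classify(0xffff800000100000UL),
	       classify(0x0000900000000000UL));	/* prints: 0 1 2 */
	return 0;
}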
From: Wang Wensheng wangwensheng4@huawei.com
ascend inclusion category: Feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4NDAW CVE: NA
-------------------
Add a helper to find the sp group of a task, along with spg management code.
Signed-off-by: Wang Wensheng wangwensheng4@huawei.com Signed-off-by: Tang Yizhou tangyizhou@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Reviewed-by: Weilong Chen chenweilong@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- mm/share_pool.c | 258 +++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 255 insertions(+), 3 deletions(-)
diff --git a/mm/share_pool.c b/mm/share_pool.c index 8e357056110e..10f1aa496116 100644 --- a/mm/share_pool.c +++ b/mm/share_pool.c @@ -89,6 +89,8 @@ int sysctl_sp_debug_mode;
static int share_pool_group_mode = SINGLE_GROUP_MODE;
+static int system_group_count; + static unsigned int sp_device_number; static unsigned long sp_dev_va_start[MAX_DEVID]; static unsigned long sp_dev_va_size[MAX_DEVID]; @@ -630,6 +632,136 @@ static inline void check_interrupt_context(void) panic("function can't be used in interrupt context\n"); }
+static void free_sp_group_id(int spg_id) +{ + /* ida operation is protected by an internal spin_lock */ + if (spg_id >= SPG_ID_AUTO_MIN && spg_id <= SPG_ID_AUTO_MAX) + ida_free(&sp_group_id_ida, spg_id); +} + +static void free_sp_group(struct sp_group *spg) +{ + fput(spg->file); + fput(spg->file_hugetlb); + free_spg_stat(spg->id); + down_write(&sp_group_sem); + idr_remove(&sp_group_idr, spg->id); + up_write(&sp_group_sem); + free_sp_group_id((unsigned int)spg->id); + kfree(spg); + system_group_count--; + WARN(system_group_count < 0, "unexpected group count\n"); +} + +static void sp_group_drop(struct sp_group *spg) +{ + if (atomic_dec_and_test(&spg->use_count)) + free_sp_group(spg); +} + +/* use with put_task_struct(task) */ +static int get_task(int pid, struct task_struct **task) +{ + struct task_struct *tsk; + + rcu_read_lock(); + tsk = find_task_by_vpid(pid); + if (!tsk || (tsk->flags & PF_EXITING)) { + rcu_read_unlock(); + return -ESRCH; + } + get_task_struct(tsk); + rcu_read_unlock(); + + *task = tsk; + return 0; +} + +static struct sp_group *get_first_group(struct mm_struct *mm) +{ + struct sp_group *spg = NULL; + struct sp_group_master *master = mm->sp_group_master; + + if (master && master->count >= 1) { + struct sp_group_node *spg_node = NULL; + + spg_node = list_first_entry(&master->node_list, + struct sp_group_node, group_node); + spg = spg_node->spg; + + /* don't revive a dead group */ + if (!spg || !atomic_inc_not_zero(&spg->use_count)) + spg = NULL; + } + + return spg; +} + +/* + * the caller must: + * 1. hold spg->rw_lock + * 2. ensure no concurrency problem for mm_struct + */ +static struct sp_group_node *is_process_in_group(struct sp_group *spg, + struct mm_struct *mm) +{ + struct sp_group_node *spg_node; + + list_for_each_entry(spg_node, &spg->procs, proc_node) + if (spg_node->master->mm == mm) + return spg_node; + + return NULL; +} + +/* user must call sp_group_drop() after use */ +static struct sp_group *__sp_find_spg_locked(int pid, int spg_id) +{ + struct sp_group *spg = NULL; + struct task_struct *tsk = NULL; + int ret = 0; + + ret = get_task(pid, &tsk); + if (ret) + return NULL; + + if (spg_id == SPG_ID_DEFAULT) { + /* + * Once we encounter a concurrency problem here. + * To fix it, we believe get_task_mm() and mmput() is too + * heavy because we just get the pointer of sp_group. + */ + task_lock(tsk); + if (tsk->mm == NULL) + spg = NULL; + else + spg = get_first_group(tsk->mm); + task_unlock(tsk); + } else { + spg = idr_find(&sp_group_idr, spg_id); + /* don't revive a dead group */ + if (!spg || !atomic_inc_not_zero(&spg->use_count)) + goto fail; + } + + put_task_struct(tsk); + return spg; + +fail: + put_task_struct(tsk); + return NULL; +} + +static struct sp_group *__sp_find_spg(int pid, int spg_id) +{ + struct sp_group *spg; + + down_read(&sp_group_sem); + spg = __sp_find_spg_locked(pid, spg_id); + up_read(&sp_group_sem); + return spg; +} + /** * sp_group_id_by_pid() - Get the sp_group ID of a process. * @pid: pid of target process. @@ -640,7 +772,22 @@ static inline void check_interrupt_context(void) */ int sp_group_id_by_pid(int pid) { - return 0; + struct sp_group *spg; + int spg_id = -ENODEV; + + check_interrupt_context(); + + spg = __sp_find_spg(pid, SPG_ID_DEFAULT); + if (!spg) + return -ENODEV; + + down_read(&spg->rw_lock); + if (spg_valid(spg)) + spg_id = spg->id; + up_read(&spg->rw_lock); + + sp_group_drop(spg); + return spg_id; } EXPORT_SYMBOL_GPL(sp_group_id_by_pid);
@@ -658,7 +805,48 @@ EXPORT_SYMBOL_GPL(sp_group_id_by_pid); */ int mg_sp_group_id_by_pid(int pid, int *spg_ids, int *num) { - return 0; + int ret = 0; + struct sp_group_node *node; + struct sp_group_master *master = NULL; + struct task_struct *tsk; + + check_interrupt_context(); + + if (!spg_ids || num <= 0) + return -EINVAL; + + ret = get_task(pid, &tsk); + if (ret) + return ret; + + down_read(&sp_group_sem); + task_lock(tsk); + if (tsk->mm) + master = tsk->mm->sp_group_master; + task_unlock(tsk); + + if (!master) { + ret = -ENODEV; + goto out_up_read; + } + + if (!master->count) { + ret = -ENODEV; + goto out_up_read; + } + if ((unsigned int)*num < master->count) { + ret = -E2BIG; + goto out_up_read; + } + *num = master->count; + + list_for_each_entry(node, &master->node_list, group_node) + *(spg_ids++) = node->spg->id; + +out_up_read: + up_read(&sp_group_sem); + put_task_struct(tsk); + return ret; } EXPORT_SYMBOL_GPL(mg_sp_group_id_by_pid);
@@ -681,7 +869,71 @@ static bool is_device_addr(unsigned long addr)
static struct sp_group *create_spg(int spg_id) { - return NULL; + int ret; + struct sp_group *spg; + char name[20]; + struct user_struct *user = NULL; + int hsize_log = MAP_HUGE_2MB >> MAP_HUGE_SHIFT; + + if (unlikely(system_group_count + 1 == MAX_GROUP_FOR_SYSTEM)) { + pr_err_ratelimited("reach system max group num\n"); + return ERR_PTR(-ENOSPC); + } + + spg = kzalloc(sizeof(*spg), GFP_KERNEL); + if (spg == NULL) + return ERR_PTR(-ENOMEM); + + ret = idr_alloc(&sp_group_idr, spg, spg_id, spg_id + 1, GFP_KERNEL); + if (ret < 0) { + pr_err_ratelimited("group %d idr alloc failed %d\n", + spg_id, ret); + goto out_kfree; + } + + spg->id = spg_id; + spg->is_alive = true; + spg->proc_num = 0; + spg->owner = current->group_leader; + atomic_set(&spg->use_count, 1); + INIT_LIST_HEAD(&spg->procs); + INIT_LIST_HEAD(&spg->spa_list); + init_rwsem(&spg->rw_lock); + + sprintf(name, "sp_group_%d", spg_id); + spg->file = shmem_kernel_file_setup(name, MAX_LFS_FILESIZE, + VM_NORESERVE); + if (IS_ERR(spg->file)) { + pr_err("spg file setup failed %ld\n", PTR_ERR(spg->file)); + ret = PTR_ERR(spg->file); + goto out_idr; + } + + spg->file_hugetlb = hugetlb_file_setup(name, MAX_LFS_FILESIZE, + VM_NORESERVE, &user, HUGETLB_ANONHUGE_INODE, hsize_log); + if (IS_ERR(spg->file_hugetlb)) { + pr_err("spg file_hugetlb setup failed %ld\n", + PTR_ERR(spg->file_hugetlb)); + ret = PTR_ERR(spg->file_hugetlb); + goto out_fput; + } + + ret = sp_init_spg_stat(spg); + if (ret < 0) + goto out_fput_all; + + system_group_count++; + return spg; + +out_fput_all: + fput(spg->file_hugetlb); +out_fput: + fput(spg->file); +out_idr: + idr_remove(&sp_group_idr, spg_id); +out_kfree: + kfree(spg); + return ERR_PTR(ret); }
int mg_sp_group_add_task(int pid, unsigned long prot, int spg_id)
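create_spg() above uses the kernel's staged-unwind idiom: each failure jumps to a label that releases exactly the resources acquired before the failing step, in reverse order. A self-contained sketch of the pattern, with malloc() calls as stand-in resources:

#include <stdio.h>
#include <stdlib.h>

struct obj { void *a, *b, *c; };

/* staged unwind as in create_spg(): release in reverse order on failure */
static struct obj *create_object(void)
{
	struct obj *o = malloc(sizeof(*o));

	if (!o)
		return NULL;
	o->a = malloc(16);
	if (!o->a)
		goto out_free_o;
	o->b = malloc(16);
	if (!o->b)
		goto out_free_a;
	o->c = malloc(16);
	if (!o->c)
		goto out_free_b;
	return o;			/* success: caller owns everything */

out_free_b:
	free(o->b);
out_free_a:
	free(o->a);
out_free_o:
	free(o);
	return NULL;
}

int main(void)
{
	struct obj *o = create_object();

	printf("%s\n", o ? "created" : "failed");
	if (o) {
		free(o->c);
		free(o->b);
		free(o->a);
		free(o);
	}
	return 0;
}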
From: Wang Wensheng wangwensheng4@huawei.com
ascend inclusion category: Feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4NDAW CVE: NA
-------------------
Free the shared user memory allocated by sp_alloc(). Note that this unmaps the memory from all processes in the share pool group.
Signed-off-by: Wang Wensheng wangwensheng4@huawei.com Signed-off-by: Tang Yizhou tangyizhou@huawei.com Signed-off-by: Peng Wu wupeng58@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Reviewed-by: Weilong Chen chenweilong@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- mm/share_pool.c | 196 +++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 195 insertions(+), 1 deletion(-)
diff --git a/mm/share_pool.c b/mm/share_pool.c index 10f1aa496116..bb134dd7ffc4 100644 --- a/mm/share_pool.c +++ b/mm/share_pool.c @@ -867,6 +867,22 @@ static bool is_device_addr(unsigned long addr) return false; }
+static loff_t addr_offset(struct sp_area *spa) +{ + unsigned long addr; + + if (unlikely(!spa)) { + WARN(1, "invalid spa when calculate addr offset\n"); + return 0; + } + addr = spa->va_start; + + if (!is_device_addr(addr)) + return (loff_t)(addr - MMAP_SHARE_POOL_START); + + return (loff_t)(addr - sp_dev_va_start[spa->device_id]); +} + static struct sp_group *create_spg(int spg_id) { int ret; @@ -1327,6 +1343,161 @@ static void sp_try_to_compact(void) sp_add_work_compact(); }
+/* + * The function calls of do_munmap() won't change any non-atomic member + * of struct sp_group. Please review the following chain: + * do_munmap -> remove_vma_list -> remove_vma -> sp_area_drop -> + * __sp_area_drop_locked -> sp_free_area + */ +static void sp_munmap(struct mm_struct *mm, unsigned long addr, + unsigned long size) +{ + int err; + + down_write(&mm->mmap_lock); + if (unlikely(mm->core_state)) { + up_write(&mm->mmap_lock); + pr_info("munmap: encoutered coredump\n"); + return; + } + + err = do_munmap(mm, addr, size, NULL); + /* we are not supposed to fail */ + if (err) + pr_err("failed to unmap VA %pK when sp munmap\n", (void *)addr); + + up_write(&mm->mmap_lock); +} + +static void __sp_free(struct sp_group *spg, unsigned long addr, + unsigned long size, struct mm_struct *stop) +{ + struct mm_struct *mm; + struct sp_group_node *spg_node = NULL; + + list_for_each_entry(spg_node, &spg->procs, proc_node) { + mm = spg_node->master->mm; + if (mm == stop) + break; + sp_munmap(mm, addr, size); + } +} + +/* Free the memory of the backing shmem or hugetlbfs */ +static void sp_fallocate(struct sp_area *spa) +{ + int ret; + unsigned long mode = FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE; + unsigned long offset = addr_offset(spa); + + ret = vfs_fallocate(spa_file(spa), mode, offset, spa_size(spa)); + if (ret) + WARN(1, "sp fallocate failed %d\n", ret); +} + +static void sp_free_unmap_fallocate(struct sp_area *spa) +{ + if (spa->spg != spg_none) { + down_read(&spa->spg->rw_lock); + __sp_free(spa->spg, spa->va_start, spa_size(spa), NULL); + sp_fallocate(spa); + up_read(&spa->spg->rw_lock); + } else { + sp_munmap(current->mm, spa->va_start, spa_size(spa)); + sp_fallocate(spa); + } +} + +static int sp_check_caller_permission(struct sp_group *spg, struct mm_struct *mm) +{ + int ret = 0; + + down_read(&spg->rw_lock); + if (!is_process_in_group(spg, mm)) + ret = -EPERM; + up_read(&spg->rw_lock); + return ret; +} + + +#define FREE_CONT 1 +#define FREE_END 2 + +struct sp_free_context { + unsigned long addr; + struct sp_area *spa; + int state; +}; + +/* when success, __sp_area_drop(spa) should be used */ +static int sp_free_get_spa(struct sp_free_context *fc) +{ + int ret = 0; + unsigned long addr = fc->addr; + struct sp_area *spa; + + fc->state = FREE_CONT; + + spa = __find_sp_area(addr); + if (!spa) { + pr_debug("sp free invalid input addr %lx\n", addr); + return -EINVAL; + } + + if (spa->type != SPA_TYPE_ALLOC) { + ret = -EINVAL; + pr_debug("sp free failed, %lx is not sp alloc addr\n", addr); + goto drop_spa; + } + fc->spa = spa; + + if (spa->spg != spg_none) { + /* + * Access control: an sp addr can only be freed by + * 1. another task in the same spg + * 2. 
a kthread + * + * a passthrough addr can only be freed by the applier process + */ + if (!current->mm) + goto check_spa; + + ret = sp_check_caller_permission(spa->spg, current->mm); + if (ret < 0) + goto drop_spa; + +check_spa: + down_write(&spa->spg->rw_lock); + if (!spg_valid(spa->spg)) { + fc->state = FREE_END; + up_write(&spa->spg->rw_lock); + goto drop_spa; + /* we must return success(0) in this situation */ + } + /* the life cycle of spa has a direct relation with sp group */ + if (unlikely(spa->is_dead)) { + up_write(&spa->spg->rw_lock); + pr_err_ratelimited("unexpected double sp free\n"); + dump_stack(); + ret = -EINVAL; + goto drop_spa; + } + spa->is_dead = true; + up_write(&spa->spg->rw_lock); + + } else { + if (current->tgid != spa->applier) { + ret = -EPERM; + goto drop_spa; + } + } + return 0; + +drop_spa: + __sp_area_drop(spa); + return ret; +} + /** * sp_free() - Free the memory allocated by sp_alloc(). * @addr: the starting VA of the memory. @@ -1338,7 +1509,30 @@ static void sp_try_to_compact(void) */ int sp_free(unsigned long addr) { - return 0; + int ret = 0; + struct sp_free_context fc = { + .addr = addr, + }; + + check_interrupt_context(); + + ret = sp_free_get_spa(&fc); + if (ret || fc.state == FREE_END) + goto out; + + sp_free_unmap_fallocate(fc.spa); + + /* current->mm == NULL: allow kthread */ + if (current->mm == NULL) + atomic64_sub(fc.spa->real_size, &kthread_stat.alloc_size); + else + sp_update_process_stat(current, false, fc.spa); + + __sp_area_drop(fc.spa); /* match __find_sp_area in sp_free_get_spa */ +out: + sp_dump_stack(); + sp_try_to_compact(); + return ret; } EXPORT_SYMBOL_GPL(sp_free);
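sp_fallocate() releases the backing pages with FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, so the shmem or hugetlbfs file keeps its size while the data pages are freed. The same mechanism can be observed from userspace on a memfd (glibc 2.27+ assumed for memfd_create()):

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	long page = sysconf(_SC_PAGESIZE);
	char buf[16] = "data";
	int fd = memfd_create("demo", 0);

	if (fd < 0)
		return 1;
	if (ftruncate(fd, 4 * page) < 0)
		return 1;
	if (pwrite(fd, buf, sizeof(buf), page) != sizeof(buf))
		return 1;

	/* drop the backing page but keep the file size, as sp_fallocate() does */
	if (fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
		      page, page))
		perror("fallocate");

	if (pread(fd, buf, sizeof(buf), page) != sizeof(buf))
		return 1;
	printf("after punch: %s\n", buf[0] ? buf : "(zeroed)");
	close(fd);
	return 0;
}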
From: Wang Wensheng wangwensheng4@huawei.com
ascend inclusion category: Feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4NDAW CVE: NA
-------------------
This shares a kernel memory range to userspace. Introduce the vm_struct flag VM_SHAREPOOL to indicate that a vm_struct is shared to userspace and must not be vfreed.
Signed-off-by: Wang Wensheng wangwensheng4@huawei.com Signed-off-by: Tang Yizhou tangyizhou@huawei.com Signed-off-by: Zhou Guanghui zhouguanghui1@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Reviewed-by: Weilong Chen chenweilong@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- include/linux/share_pool.h | 8 + include/linux/vmalloc.h | 5 + kernel/sysctl.c | 19 ++ mm/hugetlb.c | 8 + mm/share_pool.c | 404 ++++++++++++++++++++++++++++++++++++- mm/vmalloc.c | 7 + 6 files changed, 450 insertions(+), 1 deletion(-)
diff --git a/include/linux/share_pool.h b/include/linux/share_pool.h index 7e7ced34be57..6ec844708f83 100644 --- a/include/linux/share_pool.h +++ b/include/linux/share_pool.h @@ -288,6 +288,14 @@ static inline void sp_dump_stack(void) dump_stack(); }
+static inline bool is_vmalloc_sharepool(unsigned long vm_flags) +{ + if (sp_is_enabled() && (vm_flags & VM_SHAREPOOL)) + return true; + + return false; +} + #else /* CONFIG_ASCEND_SHARE_POOL */
static inline int mg_sp_group_add_task(int pid, unsigned long prot, int spg_id) diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index 8f9e13944cc7..49c94afce25b 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -28,6 +28,11 @@ struct notifier_block; /* in notifier.h */ #define VM_MAP_PUT_PAGES 0x00000200 /* put pages and free array in vfree */ #define VM_NO_HUGE_VMAP 0x00000400 /* force PAGE_SIZE pte mapping */ #define VM_HUGE_PAGES 0x00001000 /* used for vmalloc hugepages */ +#ifdef CONFIG_ASCEND_SHARE_POOL +#define VM_SHAREPOOL 0x00002000 /* remapped to sharepool */ +#else +#define VM_SHAREPOOL 0 +#endif
/* * VM_KASAN is used slighly differently depending on CONFIG_KASAN_VMALLOC. diff --git a/kernel/sysctl.c b/kernel/sysctl.c index c7073b652b0c..9d242841ca3f 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -3287,6 +3287,25 @@ static struct ctl_table vm_table[] = { .extra1 = &zero_ul, .extra2 = &sysctl_sp_compact_interval_max, }, + { + /* 0: map_unlock, 1: map_lock */ + .procname = "share_pool_map_lock_enable", + .data = &sysctl_share_pool_map_lock_enable, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, + { + .procname = "sharepool_perf_k2u", + .data = &sysctl_sp_perf_k2u, + .maxlen = sizeof(sysctl_sp_perf_k2u), + .mode = 0600, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = &ten_thousand, + }, #endif { } }; diff --git a/mm/hugetlb.c b/mm/hugetlb.c index eaddb18d58e1..74d23542f9f2 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -31,6 +31,7 @@ #include <linux/llist.h> #include <linux/cma.h> #include <linux/mman.h> +#include <linux/share_pool.h>
#include <asm/page.h> #include <asm/pgalloc.h> @@ -4110,6 +4111,13 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
pte = huge_ptep_get_and_clear(mm, address, ptep); tlb_remove_huge_tlb_entry(h, tlb, ptep, address); + + /* sharepool k2u mapped pages are marked special */ + if (sp_check_vm_share_pool(vma->vm_flags) && pte_special(pte)) { + spin_unlock(ptl); + continue; + } + if (huge_pte_dirty(pte)) set_page_dirty(page);
diff --git a/mm/share_pool.c b/mm/share_pool.c index bb134dd7ffc4..3c935d256b08 100644 --- a/mm/share_pool.c +++ b/mm/share_pool.c @@ -87,6 +87,10 @@ static int __read_mostly enable_share_k2u_spg = 1; /* debug mode */ int sysctl_sp_debug_mode;
+int sysctl_share_pool_map_lock_enable; + +int sysctl_sp_perf_k2u; + static int share_pool_group_mode = SINGLE_GROUP_MODE;
static int system_group_count; @@ -632,6 +636,13 @@ static inline void check_interrupt_context(void) panic("function can't be used in interrupt context\n"); }
+static unsigned long sp_mmap(struct mm_struct *mm, struct file *file, + struct sp_area *spa, unsigned long *populate, + unsigned long prot); +static void sp_munmap(struct mm_struct *mm, unsigned long addr, unsigned long size); +static unsigned long sp_remap_kva_to_vma(unsigned long kva, struct sp_area *spa, + struct mm_struct *mm, unsigned long prot); + static void free_sp_group_id(int spg_id) { /* ida operation is protected by an internal spin_lock */ @@ -1206,6 +1217,19 @@ static struct sp_area *__find_sp_area(unsigned long addr) return n; }
+static bool vmalloc_area_clr_flag(unsigned long kva, unsigned long flags) +{ + struct vm_struct *area; + + area = find_vm_area((void *)kva); + if (area) { + area->flags &= ~flags; + return true; + } + + return false; +} + /* * Free the VA region starting from addr to the share pool */ @@ -1235,6 +1259,9 @@ static void sp_free_area(struct sp_area *spa) } }
+ if (spa->kva && !vmalloc_area_clr_flag(spa->kva, VM_SHAREPOOL)) + pr_debug("clear spa->kva %ld is not valid\n", spa->kva); + spa_dec_usage(spa); if (spa->spg != spg_none) list_del(&spa->link); @@ -1542,6 +1569,37 @@ int mg_sp_free(unsigned long addr) } EXPORT_SYMBOL_GPL(mg_sp_free);
+/* wrapper of __do_mmap() and the caller must hold down_write(&mm->mmap_lock). */ +static unsigned long sp_mmap(struct mm_struct *mm, struct file *file, + struct sp_area *spa, unsigned long *populate, + unsigned long prot) +{ + unsigned long addr = spa->va_start; + unsigned long size = spa_size(spa); + unsigned long flags = MAP_FIXED | MAP_SHARED | MAP_POPULATE | + MAP_SHARE_POOL; + unsigned long vm_flags = VM_NORESERVE | VM_SHARE_POOL | VM_DONTCOPY; + unsigned long pgoff = addr_offset(spa) >> PAGE_SHIFT; + + /* Mark the mapped region to be locked. After the MAP_LOCKED is enable, + * multiple tasks will preempt resources, causing performance loss. + */ + if (sysctl_share_pool_map_lock_enable) + flags |= MAP_LOCKED; + + atomic_inc(&spa->use_count); + addr = __do_mmap_mm(mm, file, addr, size, prot, flags, vm_flags, pgoff, + populate, NULL); + if (IS_ERR_VALUE(addr)) { + atomic_dec(&spa->use_count); + pr_err("do_mmap fails %ld\n", addr); + } else { + BUG_ON(addr != spa->va_start); + } + + return addr; +} + /** * sp_alloc() - Allocate shared memory for all the processes in a sp_group. * @size: the size of memory to allocate. @@ -1596,6 +1654,314 @@ static int is_vmap_hugepage(unsigned long addr) return 0; }
+static unsigned long __sp_remap_get_pfn(unsigned long kva) +{ + unsigned long pfn; + + if (is_vmalloc_addr((void *)kva)) + pfn = vmalloc_to_pfn((void *)kva); + else + pfn = virt_to_pfn(kva); + + return pfn; +} + +/* when called by k2u to group, always make sure rw_lock of spg is down */ +static unsigned long sp_remap_kva_to_vma(unsigned long kva, struct sp_area *spa, + struct mm_struct *mm, unsigned long prot) +{ + struct vm_area_struct *vma; + unsigned long ret_addr; + unsigned long populate = 0; + int ret = 0; + unsigned long addr, buf, offset; + + down_write(&mm->mmap_lock); + if (unlikely(mm->core_state)) { + pr_err("k2u mmap: encountered coredump, abort\n"); + ret_addr = -EBUSY; + goto put_mm; + } + + ret_addr = sp_mmap(mm, spa_file(spa), spa, &populate, prot); + if (IS_ERR_VALUE(ret_addr)) { + pr_debug("k2u mmap failed %lx\n", ret_addr); + goto put_mm; + } + BUG_ON(ret_addr != spa->va_start); + + vma = find_vma(mm, ret_addr); + BUG_ON(vma == NULL); + if (prot & PROT_WRITE) + vma->vm_page_prot = __pgprot(((~PTE_RDONLY) & vma->vm_page_prot.pgprot) | PTE_DIRTY); + + if (is_vm_hugetlb_page(vma)) { + ret = remap_vmalloc_hugepage_range(vma, (void *)kva, 0); + if (ret) { + do_munmap(mm, ret_addr, spa_size(spa), NULL); + pr_debug("remap vmalloc hugepage failed, ret %d, kva is %lx\n", + ret, (unsigned long)kva); + ret_addr = ret; + goto put_mm; + } + vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP; + } else { + buf = ret_addr; + addr = kva; + offset = 0; + do { + ret = remap_pfn_range(vma, buf, __sp_remap_get_pfn(addr), PAGE_SIZE, + __pgprot(vma->vm_page_prot.pgprot)); + if (ret) { + do_munmap(mm, ret_addr, spa_size(spa), NULL); + pr_err("remap_pfn_range failed %d\n", ret); + ret_addr = ret; + goto put_mm; + } + offset += PAGE_SIZE; + buf += PAGE_SIZE; + addr += PAGE_SIZE; + } while (offset < spa_size(spa)); + } + +put_mm: + up_write(&mm->mmap_lock); + + return ret_addr; +} + +/** + * sp_make_share_kva_to_task() - Share kernel memory to current task. + * @kva: the VA of shared kernel memory + * @size: the size of area to share, should be aligned properly + * @sp_flags: the flags for the opreation + * + * Return: + * * if succeed, return the shared user address to start at. + * * if fail, return the pointer of -errno. 
+ */ +static void *sp_make_share_kva_to_task(unsigned long kva, unsigned long size, unsigned long sp_flags) +{ + void *uva; + struct sp_area *spa; + struct spg_proc_stat *stat; + unsigned long prot = PROT_READ | PROT_WRITE; + + down_write(&sp_group_sem); + stat = sp_init_process_stat(current, current->mm, spg_none); + up_write(&sp_group_sem); + if (IS_ERR(stat)) { + pr_err_ratelimited("k2u_task init process stat failed %lx\n", + PTR_ERR(stat)); + return stat; + } + + spa = sp_alloc_area(size, sp_flags, spg_none, SPA_TYPE_K2TASK, current->tgid); + if (IS_ERR(spa)) { + pr_err_ratelimited("alloc spa failed in k2u_task (potential no enough virtual memory when -75): %ld\n", + PTR_ERR(spa)); + return spa; + } + + spa->kva = kva; + + uva = (void *)sp_remap_kva_to_vma(kva, spa, current->mm, prot); + __sp_area_drop(spa); + if (IS_ERR(uva)) + pr_err("remap k2u to task failed %ld\n", PTR_ERR(uva)); + else { + update_spg_proc_stat(size, true, stat, SPA_TYPE_K2TASK); + spa->mm = current->mm; + } + + return uva; +} + +/** + * Share kernel memory to a spg, the current process must be in that group + * @kva: the VA of shared kernel memory + * @size: the size of area to share, should be aligned properly + * @sp_flags: the flags for the opreation + * @spg: the sp group to be shared with + * + * Return: the shared user address to start at + */ +static void *sp_make_share_kva_to_spg(unsigned long kva, unsigned long size, + unsigned long sp_flags, struct sp_group *spg) +{ + struct sp_area *spa; + struct mm_struct *mm; + struct sp_group_node *spg_node; + void *uva = ERR_PTR(-ENODEV); + + down_read(&spg->rw_lock); + spa = sp_alloc_area(size, sp_flags, spg, SPA_TYPE_K2SPG, current->tgid); + if (IS_ERR(spa)) { + up_read(&spg->rw_lock); + pr_err_ratelimited("alloc spa failed in k2u_spg (potential no enough virtual memory when -75): %ld\n", + PTR_ERR(spa)); + return spa; + } + + spa->kva = kva; + + list_for_each_entry(spg_node, &spg->procs, proc_node) { + mm = spg_node->master->mm; + uva = (void *)sp_remap_kva_to_vma(kva, spa, mm, spg_node->prot); + if (IS_ERR(uva)) { + pr_err("remap k2u to spg failed %ld\n", PTR_ERR(uva)); + __sp_free(spg, spa->va_start, spa_size(spa), mm); + goto out; + } + } + +out: + up_read(&spg->rw_lock); + __sp_area_drop(spa); + if (!IS_ERR(uva)) + sp_update_process_stat(current, true, spa); + + return uva; +} + +static bool vmalloc_area_set_flag(unsigned long kva, unsigned long flags) +{ + struct vm_struct *area; + + area = find_vm_area((void *)kva); + if (area) { + area->flags |= flags; + return true; + } + + return false; +} + +struct sp_k2u_context { + unsigned long kva; + unsigned long kva_aligned; + unsigned long size; + unsigned long size_aligned; + unsigned long sp_flags; + int spg_id; + bool to_task; + struct timespec64 start; + struct timespec64 end; +}; + +static void trace_sp_k2u_begin(struct sp_k2u_context *kc) +{ + if (!sysctl_sp_perf_k2u) + return; + + ktime_get_ts64(&kc->start); +} + +static void trace_sp_k2u_finish(struct sp_k2u_context *kc, void *uva) +{ + unsigned long cost; + + if (!sysctl_sp_perf_k2u) + return; + + ktime_get_ts64(&kc->end); + + cost = SEC2US(kc->end.tv_sec - kc->start.tv_sec) + + NS2US(kc->end.tv_nsec - kc->start.tv_nsec); + if (cost >= (unsigned long)sysctl_sp_perf_k2u) { + pr_err("Task %s(%d/%d) sp_k2u returns 0x%lx consumes %luus, size is %luKB, size_aligned is %luKB, sp_flags is %lx, to_task is %d\n", + current->comm, current->tgid, current->pid, + (unsigned long)uva, cost, byte2kb(kc->size), byte2kb(kc->size_aligned), + kc->sp_flags, kc->to_task); + 
} +} + +static int sp_k2u_prepare(unsigned long kva, unsigned long size, + unsigned long sp_flags, int spg_id, struct sp_k2u_context *kc) +{ + int is_hugepage; + unsigned int page_size = PAGE_SIZE; + unsigned long kva_aligned, size_aligned; + + trace_sp_k2u_begin(kc); + + if (sp_flags & ~SP_DVPP) { + pr_err_ratelimited("k2u sp_flags %lx error\n", sp_flags); + return -EINVAL; + } + + if (!current->mm) { + pr_err_ratelimited("k2u: kthread is not allowed\n"); + return -EPERM; + } + + is_hugepage = is_vmap_hugepage(kva); + if (is_hugepage > 0) { + sp_flags |= SP_HUGEPAGE; + page_size = PMD_SIZE; + } else if (is_hugepage == 0) { + /* do nothing */ + } else { + pr_err_ratelimited("k2u kva is not vmalloc address\n"); + return is_hugepage; + } + + /* aligned down kva is convenient for caller to start with any valid kva */ + kva_aligned = ALIGN_DOWN(kva, page_size); + size_aligned = ALIGN(kva + size, page_size) - kva_aligned; + + if (!vmalloc_area_set_flag(kva_aligned, VM_SHAREPOOL)) { + pr_debug("k2u_task kva %lx is not valid\n", kva_aligned); + return -EINVAL; + } + + kc->kva = kva; + kc->kva_aligned = kva_aligned; + kc->size = size; + kc->size_aligned = size_aligned; + kc->sp_flags = sp_flags; + kc->spg_id = spg_id; + kc->to_task = false; + return 0; +} + +static int sp_check_k2task(struct sp_k2u_context *kc) +{ + int ret = 0; + int spg_id = kc->spg_id; + + if (share_pool_group_mode == SINGLE_GROUP_MODE) { + struct sp_group *spg = get_first_group(current->mm); + + if (!spg) { + if (spg_id != SPG_ID_NONE && spg_id != SPG_ID_DEFAULT) + ret = -EINVAL; + else + kc->to_task = true; + } else { + if (spg_id != SPG_ID_DEFAULT && spg_id != spg->id) + ret = -EINVAL; + sp_group_drop(spg); + } + } else { + if (spg_id == SPG_ID_DEFAULT || spg_id == SPG_ID_NONE) + kc->to_task = true; + } + return ret; +} + +static void *sp_k2u_finish(void *uva, struct sp_k2u_context *kc) +{ + if (IS_ERR(uva)) + vmalloc_area_clr_flag(kc->kva_aligned, VM_SHAREPOOL); + else + uva = uva + (kc->kva - kc->kva_aligned); + + trace_sp_k2u_finish(kc, uva); + sp_dump_stack(); + return uva; +} + /** * sp_make_share_k2u() - Share kernel memory to current process or an sp_group. * @kva: the VA of shared kernel memory. @@ -1616,7 +1982,43 @@ static int is_vmap_hugepage(unsigned long addr) void *sp_make_share_k2u(unsigned long kva, unsigned long size, unsigned long sp_flags, int pid, int spg_id) { - return NULL; + void *uva; + int ret; + struct sp_k2u_context kc; + + check_interrupt_context(); + + ret = sp_k2u_prepare(kva, size, sp_flags, spg_id, &kc); + if (ret) + return ERR_PTR(ret); + + ret = sp_check_k2task(&kc); + if (ret) { + uva = ERR_PTR(ret); + goto out; + } + + if (kc.to_task) + uva = sp_make_share_kva_to_task(kc.kva_aligned, kc.size_aligned, kc.sp_flags); + else { + struct sp_group *spg; + + spg = __sp_find_spg(current->pid, kc.spg_id); + if (spg) { + ret = sp_check_caller_permission(spg, current->mm); + if (ret < 0) { + sp_group_drop(spg); + uva = ERR_PTR(ret); + goto out; + } + uva = sp_make_share_kva_to_spg(kc.kva_aligned, kc.size_aligned, kc.sp_flags, spg); + sp_group_drop(spg); + } else + uva = ERR_PTR(-ENODEV); + } + +out: + return sp_k2u_finish(uva, &kc); } EXPORT_SYMBOL_GPL(sp_make_share_k2u);
diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 755be0c19c81..dadbea29241d 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -37,6 +37,7 @@ #include <linux/pgtable.h> #include <linux/uaccess.h> #include <linux/hugetlb.h> +#include <linux/share_pool.h> #include <asm/io.h> #include <asm/tlbflush.h> #include <asm/shmparam.h> @@ -2622,6 +2623,12 @@ static void __vunmap(const void *addr, int deallocate_pages) return; }
+ /* unmapping a sharepool vm area would cause a memory leak! */ + if (is_vmalloc_sharepool(area->flags)) { + WARN(1, "Memory leak due to vfree() sharepool vm area (%p) !\n", addr); + return; + } + debug_check_no_locks_freed(area->addr, get_vm_area_size(area)); debug_check_no_obj_freed(area->addr, get_vm_area_size(area));
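The sysctl_sp_perf_k2u knob makes trace_sp_k2u_finish() report any k2u call slower than the configured number of microseconds. The timestamp handling is the usual fold of seconds and nanoseconds into microseconds; a userspace equivalent using clock_gettime(), with a local threshold standing in for the sysctl:

#include <stdio.h>
#include <time.h>
#include <unistd.h>

#define SEC2US(s)	((s) * 1000000L)
#define NS2US(ns)	((ns) / 1000L)

int main(void)
{
	long threshold_us = 100;	/* stand-in for sysctl_sp_perf_k2u */
	struct timespec start, end;
	long cost;

	clock_gettime(CLOCK_MONOTONIC, &start);
	usleep(200);			/* the operation being traced */
	clock_gettime(CLOCK_MONOTONIC, &end);

	/* same microsecond arithmetic as trace_sp_k2u_finish() */
	cost = SEC2US(end.tv_sec - start.tv_sec) +
	       NS2US(end.tv_nsec - start.tv_nsec);
	if (cost >= threshold_us)
		fprintf(stderr, "traced call took %ldus\n", cost);
	return 0;
}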
From: Wang Wensheng wangwensheng4@huawei.com
ascend inclusion category: Feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4NDAW CVE: NA
-------------------
This is the reverse of sp_make_share_k2u().
Signed-off-by: Wang Wensheng wangwensheng4@huawei.com Signed-off-by: Tang Yizhou tangyizhou@huawei.com Signed-off-by: Peng Wu wupeng58@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Reviewed-by: Weilong Chen chenweilong@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- mm/share_pool.c | 163 +++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 162 insertions(+), 1 deletion(-)
diff --git a/mm/share_pool.c b/mm/share_pool.c index 3c935d256b08..355c429b3843 100644 --- a/mm/share_pool.c +++ b/mm/share_pool.c @@ -2300,9 +2300,170 @@ void *mg_sp_make_share_u2k(unsigned long uva, unsigned long size, int pid) } EXPORT_SYMBOL_GPL(mg_sp_make_share_u2k);
+/* + * Input parameters uva, pid and spg_id are now useless. spg_id will be useful + * when supporting a process in multiple sp groups. + * + * The procedure of unsharing a uva must be compatible with: + * + * 1. DVPP channel destroy procedure: + * do_exit() -> exit_mm() (mm no longer in spg and current->mm == NULL) -> + * exit_task_work() -> task_work_run() -> __fput() -> ... -> vdec_close() -> + * sp_unshare(uva, SPG_ID_DEFAULT) + * + * 2. Process A once was the target of k2u(to group), then it exits. + * Guard worker kthread tries to free this uva and it must succeed, otherwise + * spa of this uva leaks. + * + * This also means we must trust DVPP channel destroy and guard worker code. + */ static int sp_unshare_uva(unsigned long uva, unsigned long size) { - return 0; + int ret = 0; + struct mm_struct *mm; + struct sp_area *spa; + unsigned long uva_aligned; + unsigned long size_aligned; + unsigned int page_size; + + /* + * at first we guess it's a hugepage addr + * we can tolerate at most PMD_SIZE or PAGE_SIZE which is matched in k2u + */ + spa = __find_sp_area(ALIGN_DOWN(uva, PMD_SIZE)); + if (!spa) { + spa = __find_sp_area(ALIGN_DOWN(uva, PAGE_SIZE)); + if (!spa) { + ret = -EINVAL; + pr_debug("invalid input uva %lx in unshare uva\n", (unsigned long)uva); + goto out; + } + } + + if (spa->type != SPA_TYPE_K2TASK && spa->type != SPA_TYPE_K2SPG) { + pr_err_ratelimited("unshare wrong type spa\n"); + ret = -EINVAL; + goto out_drop_area; + } + /* + * 1. overflow actually won't happen because the spa must be valid. + * 2. we must unshare [spa->va_start, spa->va_start + spa->real_size) completely + * because an spa is in one-to-one correspondence with a vma. + * Thus the input parameter size is not strictly needed. + */ + page_size = (spa->is_hugepage ? PMD_SIZE : PAGE_SIZE); + uva_aligned = spa->va_start; + size_aligned = spa->real_size; + + if (size_aligned < ALIGN(size, page_size)) { + ret = -EINVAL; + pr_err_ratelimited("unshare uva failed, invalid parameter size %lu\n", size); + goto out_drop_area; + } + + if (spa->type == SPA_TYPE_K2TASK) { + if (spa->applier != current->tgid) { + pr_err_ratelimited("unshare uva(to task) no permission\n"); + ret = -EPERM; + goto out_drop_area; + } + + if (!spa->mm) { + pr_err_ratelimited("unshare uva(to task) failed, no spa owner\n"); + ret = -EINVAL; + goto out_drop_area; + } + + /* + * current thread may be exiting in a multithread process + * + * 1. a kthread never needs to unshare after the process has exited + * 2. in the dvpp channel destroy procedure, exit_mm() has been called, + * so there is no need to unshare + */ + mm = get_task_mm(current->group_leader); + if (!mm) { + pr_info_ratelimited("no need to unshare uva(to task), target process mm is exiting\n"); + goto out_clr_flag; + } + + if (spa->mm != mm) { + pr_err_ratelimited("unshare uva(to task) failed, spa does not belong to the task\n"); + ret = -EINVAL; + mmput(mm); + goto out_drop_area; + } + + down_write(&mm->mmap_lock); + if (unlikely(mm->core_state)) { + ret = 0; + up_write(&mm->mmap_lock); + mmput(mm); + goto out_drop_area; + } + + ret = do_munmap(mm, uva_aligned, size_aligned, NULL); + up_write(&mm->mmap_lock); + mmput(mm); + /* we are not supposed to fail */ + if (ret) + pr_err("failed to unmap VA %pK when munmap in unshare uva\n", + (void *)uva_aligned); + sp_update_process_stat(current, false, spa); + + } else if (spa->type == SPA_TYPE_K2SPG) { + down_read(&spa->spg->rw_lock); + /* always allow kthread and dvpp channel destroy procedure */ + if (current->mm) { + if (!is_process_in_group(spa->spg, current->mm)) { + up_read(&spa->spg->rw_lock); + pr_err_ratelimited("unshare uva(to group) failed, caller process doesn't belong to target group\n"); + ret = -EPERM; + goto out_drop_area; + } + } + up_read(&spa->spg->rw_lock); + + down_write(&spa->spg->rw_lock); + if (!spg_valid(spa->spg)) { + up_write(&spa->spg->rw_lock); + pr_info_ratelimited("share pool: no need to unshare uva(to group), sp group of spa is dead\n"); + goto out_clr_flag; + } + /* the life cycle of spa has a direct relation with sp group */ + if (unlikely(spa->is_dead)) { + up_write(&spa->spg->rw_lock); + pr_err_ratelimited("unexpected double sp unshare\n"); + dump_stack(); + ret = -EINVAL; + goto out_drop_area; + } + spa->is_dead = true; + up_write(&spa->spg->rw_lock); + + down_read(&spa->spg->rw_lock); + __sp_free(spa->spg, uva_aligned, size_aligned, NULL); + up_read(&spa->spg->rw_lock); + + if (current->mm == NULL) + atomic64_sub(spa->real_size, &kthread_stat.k2u_size); + else + sp_update_process_stat(current, false, spa); + } else { + WARN(1, "unshare uva invalid spa type"); + } + + sp_dump_stack(); + +out_clr_flag: + if (!vmalloc_area_clr_flag(spa->kva, VM_SHAREPOOL)) + pr_debug("clear spa->kva %ld is not valid\n", spa->kva); + spa->kva = 0; + +out_drop_area: + __sp_area_drop(spa); +out: + return ret; }
/* No possible concurrent protection, take care when use */
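For context, a minimal kernel-side sketch of the expected caller, assuming the sp_unshare(va, size, pid, spg_id) wrapper declared earlier in this series routes into sp_unshare_uva(); the demo_ name is hypothetical, and pid/spg_id are ignored on the uva path, as the comment above notes:

/* hypothetical driver teardown: undo an earlier k2u share */
static void demo_k2u_teardown(unsigned long uva, unsigned long size)
{
	int ret;

	/* pid/spg_id are currently unused by the uva path */
	ret = sp_unshare(uva, size, current->tgid, SPG_ID_DEFAULT);
	if (ret)
		pr_err("demo: sp_unshare of %lx failed: %d\n", uva, ret);
}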
From: Wang Wensheng wangwensheng4@huawei.com
ascend inclusion category: Feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4NDAW CVE: NA
-------------------
Allocate shared memory for tasks in a share pool group. Tasks in the same group can access the memory at the same virtual address.
Signed-off-by: Wang Wensheng wangwensheng4@huawei.com Signed-off-by: Tang Yizhou tangyizhou@huawei.com Signed-off-by: Peng Wu wupeng58@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Reviewed-by: Weilong Chen chenweilong@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- kernel/sysctl.c | 9 ++ mm/share_pool.c | 397 +++++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 405 insertions(+), 1 deletion(-)
diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 9d242841ca3f..97019c861d73 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -3306,6 +3306,15 @@ static struct ctl_table vm_table[] = { .extra1 = SYSCTL_ZERO, .extra2 = &ten_thousand, }, + { + .procname = "sharepool_perf_alloc", + .data = &sysctl_sp_perf_alloc, + .maxlen = sizeof(sysctl_sp_perf_alloc), + .mode = 0600, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = &ten_thousand, + }, #endif { } }; diff --git a/mm/share_pool.c b/mm/share_pool.c index 355c429b3843..d9f015e22abe 100644 --- a/mm/share_pool.c +++ b/mm/share_pool.c @@ -90,6 +90,7 @@ int sysctl_sp_debug_mode; int sysctl_share_pool_map_lock_enable;
int sysctl_sp_perf_k2u; +int sysctl_sp_perf_alloc;
static int share_pool_group_mode = SINGLE_GROUP_MODE;
@@ -1600,6 +1601,373 @@ static unsigned long sp_mmap(struct mm_struct *mm, struct file *file, return addr; }
+#define ALLOC_NORMAL 1 +#define ALLOC_RETRY 2 +#define ALLOC_NOMEM 3 + +struct sp_alloc_context { + struct sp_group *spg; + struct file *file; + unsigned long size; + unsigned long size_aligned; + unsigned long sp_flags; + unsigned long populate; + int state; + bool need_fallocate; + struct timespec64 start; + struct timespec64 end; +}; + +static void trace_sp_alloc_begin(struct sp_alloc_context *ac) +{ + if (!sysctl_sp_perf_alloc) + return; + + ktime_get_ts64(&ac->start); +} + +static void trace_sp_alloc_finish(struct sp_alloc_context *ac, unsigned long va) +{ + unsigned long cost; + bool is_pass_through = ac->spg == spg_none ? true : false; + + if (!sysctl_sp_perf_alloc) + return; + + ktime_get_ts64(&ac->end); + + cost = SEC2US(ac->end.tv_sec - ac->start.tv_sec) + + NS2US(ac->end.tv_nsec - ac->start.tv_nsec); + if (cost >= (unsigned long)sysctl_sp_perf_alloc) { + pr_err("Task %s(%d/%d) sp_alloc returns 0x%lx consumes %luus, size is %luKB, size_aligned is %luKB, sp_flags is %lx, pass through is %d\n", + current->comm, current->tgid, current->pid, + va, cost, byte2kb(ac->size), byte2kb(ac->size_aligned), ac->sp_flags, is_pass_through); + } +} + +static int sp_alloc_prepare(unsigned long size, unsigned long sp_flags, + int spg_id, struct sp_alloc_context *ac) +{ + struct sp_group *spg; + + check_interrupt_context(); + + trace_sp_alloc_begin(ac); + + /* mdc scene hack */ + if (enable_mdc_default_group) + spg_id = mdc_default_group_id; + + if (unlikely(!size || (size >> PAGE_SHIFT) > totalram_pages())) { + pr_err_ratelimited("allocation failed, invalid size %lu\n", size); + return -EINVAL; + } + + if (spg_id != SPG_ID_DEFAULT && spg_id < SPG_ID_MIN) { + pr_err_ratelimited("allocation failed, invalid group id %d\n", spg_id); + return -EINVAL; + } + + if (sp_flags & (~SP_FLAG_MASK)) { + pr_err_ratelimited("allocation failed, invalid flag %lx\n", sp_flags); + return -EINVAL; + } + + if (sp_flags & SP_HUGEPAGE_ONLY) + sp_flags |= SP_HUGEPAGE; + + if (share_pool_group_mode == SINGLE_GROUP_MODE) { + spg = __sp_find_spg(current->pid, SPG_ID_DEFAULT); + if (spg) { + if (spg_id != SPG_ID_DEFAULT && spg->id != spg_id) { + sp_group_drop(spg); + return -ENODEV; + } + + /* up_read will be at the end of sp_alloc */ + down_read(&spg->rw_lock); + if (!spg_valid(spg)) { + up_read(&spg->rw_lock); + sp_group_drop(spg); + pr_err_ratelimited("allocation failed, spg is dead\n"); + return -ENODEV; + } + } else { /* allocation pass-through scene */ + if (enable_mdc_default_group) { + int ret = 0; + + ret = sp_group_add_task(current->tgid, spg_id); + if (ret < 0) { + pr_err_ratelimited("add group failed in pass through\n"); + return ret; + } + + spg = __sp_find_spg(current->pid, SPG_ID_DEFAULT); + + /* up_read will be at the end of sp_alloc */ + down_read(&spg->rw_lock); + if (!spg_valid(spg)) { + up_read(&spg->rw_lock); + sp_group_drop(spg); + pr_err_ratelimited("pass through allocation failed, spg is dead\n"); + return -ENODEV; + } + } else { + spg = spg_none; + } + } + } else { + if (spg_id != SPG_ID_DEFAULT) { + spg = __sp_find_spg(current->pid, spg_id); + if (!spg) { + pr_err_ratelimited("allocation failed, can't find group\n"); + return -ENODEV; + } + + /* up_read will be at the end of sp_alloc */ + down_read(&spg->rw_lock); + if (!spg_valid(spg)) { + up_read(&spg->rw_lock); + sp_group_drop(spg); + pr_err_ratelimited("allocation failed, spg is dead\n"); + return -ENODEV; + } + + if (!is_process_in_group(spg, current->mm)) { + up_read(&spg->rw_lock); + sp_group_drop(spg); + pr_err_ratelimited("allocation failed, task not in group\n"); + return -ENODEV; + } + } else { /* allocation pass-through scene */ + spg = spg_none; + } + } + + if (sp_flags & SP_HUGEPAGE) { + ac->file = spg->file_hugetlb; + ac->size_aligned = ALIGN(size, PMD_SIZE); + } else { + ac->file = spg->file; + ac->size_aligned = ALIGN(size, PAGE_SIZE); + } + + ac->spg = spg; + ac->size = size; + ac->sp_flags = sp_flags; + ac->state = ALLOC_NORMAL; + ac->need_fallocate = false; + return 0; +} + +static void sp_alloc_unmap(struct mm_struct *mm, struct sp_area *spa, + struct sp_group_node *spg_node) +{ + if (spa->spg != spg_none) + __sp_free(spa->spg, spa->va_start, spa->real_size, mm); +} + +static int sp_alloc_mmap(struct mm_struct *mm, struct sp_area *spa, + struct sp_group_node *spg_node, struct sp_alloc_context *ac) +{ + int ret = 0; + unsigned long mmap_addr; + /* pass through default permission */ + unsigned long prot = PROT_READ | PROT_WRITE; + unsigned long sp_addr = spa->va_start; + unsigned long populate = 0; + struct vm_area_struct *vma; + + down_write(&mm->mmap_lock); + if (unlikely(mm->core_state)) { + up_write(&mm->mmap_lock); + sp_alloc_unmap(mm, spa, spg_node); + ac->state = ALLOC_NOMEM; + pr_info("allocation encountered coredump\n"); + return -EFAULT; + } + + if (spg_node) + prot = spg_node->prot; + + /* when success, mmap_addr == spa->va_start */ + mmap_addr = sp_mmap(mm, spa_file(spa), spa, &populate, prot); + if (IS_ERR_VALUE(mmap_addr)) { + up_write(&mm->mmap_lock); + sp_alloc_unmap(mm, spa, spg_node); + pr_err("sp mmap in allocation failed %ld\n", mmap_addr); + return PTR_ERR((void *)mmap_addr); + } + + if (unlikely(populate == 0)) { + up_write(&mm->mmap_lock); + pr_err("allocation sp mmap populate failed\n"); + ret = -EFAULT; + goto unmap; + } + ac->populate = populate; + + vma = find_vma(mm, sp_addr); + if (unlikely(!vma)) { + up_write(&mm->mmap_lock); + WARN(1, "allocation failed, can't find %lx vma\n", sp_addr); + ret = -EINVAL; + goto unmap; + } + /* clean PTE_RDONLY flags or trigger SMMU event */ + if (prot & PROT_WRITE) + vma->vm_page_prot = __pgprot(((~PTE_RDONLY) & vma->vm_page_prot.pgprot) | PTE_DIRTY); + up_write(&mm->mmap_lock); + + return ret; + +unmap: + if (spa->spg != spg_none) + sp_alloc_unmap(list_next_entry(spg_node, proc_node)->master->mm, spa, spg_node); + else + sp_munmap(mm, spa->va_start, spa->real_size); + return ret; +} + +static void sp_alloc_fallback(struct sp_area *spa, struct sp_alloc_context *ac) +{ + struct sp_spg_stat *stat = ac->spg->stat; + + if (ac->file == ac->spg->file) { + ac->state = ALLOC_NOMEM; + return; + } + + atomic_inc(&stat->hugepage_failures); + if (!(ac->sp_flags & SP_HUGEPAGE_ONLY)) { + ac->file = ac->spg->file; + ac->size_aligned = ALIGN(ac->size, PAGE_SIZE); + ac->sp_flags &= ~SP_HUGEPAGE; + ac->state = ALLOC_RETRY; + __sp_area_drop(spa); + return; + } + ac->state = ALLOC_NOMEM; +} + +static int sp_alloc_populate(struct mm_struct *mm, struct sp_area *spa, + struct sp_group_node *spg_node, struct sp_alloc_context *ac) +{ + int ret = 0; + unsigned long sp_addr = spa->va_start; + unsigned int noreclaim_flag = 0; + + /* + * Direct reclaim and compaction may take a long + * time. As a result, the sp mutex would be held for too + * long a time and cause a hung task problem. In this + * case, set the PF_MEMALLOC flag to prevent + * direct reclaim and compaction from being executed. + * Since direct reclaim and compaction are not performed + * when the fragmentation is severe or the memory is + * insufficient, 2MB continuous physical pages fail + * to be allocated. This situation is allowed. + */ + if (spa->is_hugepage) + noreclaim_flag = memalloc_noreclaim_save(); + + /* + * We are not ignoring errors, so if we fail to allocate + * physical memory we just return failure, so we won't encounter + * page fault later on, and more importantly sp_make_share_u2k() + * depends on this feature (and MAP_LOCKED) to work correctly. + */ + ret = do_mm_populate(mm, sp_addr, ac->populate, 0); + if (spa->is_hugepage) { + memalloc_noreclaim_restore(noreclaim_flag); + if (ret) + sp_add_work_compact(); + } + if (ret) { + if (spa->spg != spg_none) + sp_alloc_unmap(list_next_entry(spg_node, proc_node)->master->mm, spa, spg_node); + else + sp_munmap(mm, spa->va_start, spa->real_size); + + if (unlikely(fatal_signal_pending(current))) + pr_warn_ratelimited("allocation failed, current thread is killed\n"); + else + pr_warn_ratelimited("allocation failed due to mm populate failure (potentially out of memory when -12): %d\n", + ret); + sp_fallocate(spa); /* need this, otherwise memleak */ + sp_alloc_fallback(spa, ac); + } else { + ac->need_fallocate = true; + } + return ret; +} + +static int __sp_alloc_mmap_populate(struct mm_struct *mm, struct sp_area *spa, + struct sp_group_node *spg_node, struct sp_alloc_context *ac) +{ + int ret; + + ret = sp_alloc_mmap(mm, spa, spg_node, ac); + if (ret < 0) { + if (ac->need_fallocate) { + /* e.g. second sp_mmap fail */ + sp_fallocate(spa); + ac->need_fallocate = false; + } + return ret; + } + + ret = sp_alloc_populate(mm, spa, spg_node, ac); + return ret; +} + +static int sp_alloc_mmap_populate(struct sp_area *spa, + struct sp_alloc_context *ac) +{ + int ret; + struct mm_struct *mm; + struct sp_group_node *spg_node; + + if (spa->spg == spg_none) { + ret = __sp_alloc_mmap_populate(current->mm, spa, NULL, ac); + } else { + /* create mapping for each process in the group */ + list_for_each_entry(spg_node, &spa->spg->procs, proc_node) { + mm = spg_node->master->mm; + ret = __sp_alloc_mmap_populate(mm, spa, spg_node, ac); + if (ret) + return ret; + } + } + return ret; +} + +/* spa may be an error pointer, so introduce variable spg */ +static void sp_alloc_finish(int result, struct sp_area *spa, + struct sp_alloc_context *ac) +{ + struct sp_group *spg = ac->spg; + bool is_pass_through = spg == spg_none ? true : false; + + /* match sp_alloc_prepare */ + if (!is_pass_through) + up_read(&spg->rw_lock); + + if (!result) + sp_update_process_stat(current, true, spa); + + /* this will free spa if mmap failed */ + if (spa && !IS_ERR(spa)) + __sp_area_drop(spa); + + if (!is_pass_through) + sp_group_drop(spg); + + trace_sp_alloc_finish(ac, spa->va_start); + sp_dump_stack(); + sp_try_to_compact(); +} + /** * sp_alloc() - Allocate shared memory for all the processes in a sp_group. * @size: the size of memory to allocate. 
@@ -1614,7 +1982,34 @@ static unsigned long sp_mmap(struct mm_struct *mm, struct file *file, */ void *sp_alloc(unsigned long size, unsigned long sp_flags, int spg_id) { - return NULL; + struct sp_area *spa = NULL; + int ret = 0; + struct sp_alloc_context ac; + + ret = sp_alloc_prepare(size, sp_flags, spg_id, &ac); + if (ret) + return ERR_PTR(ret); + +try_again: + spa = sp_alloc_area(ac.size_aligned, ac.sp_flags, ac.spg, + SPA_TYPE_ALLOC, current->tgid); + if (IS_ERR(spa)) { + pr_err_ratelimited("alloc spa failed in allocation(potential no enough virtual memory when -75): %ld\n", + PTR_ERR(spa)); + ret = PTR_ERR(spa); + goto out; + } + + ret = sp_alloc_mmap_populate(spa, &ac); + if (ret && ac.state == ALLOC_RETRY) + goto try_again; + +out: + sp_alloc_finish(ret, spa, &ac); + if (ret) + return ERR_PTR(ret); + else + return (void *)(spa->va_start); } EXPORT_SYMBOL_GPL(sp_alloc);
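As a usage illustration for the path above, a minimal kernel-side sketch; it assumes sp_free(unsigned long addr) from earlier in this series as the matching release interface, and demo_spg_alloc() is a hypothetical caller:

/* hypothetical caller: allocate 2MB of memory shared by a whole group */
static void *demo_spg_alloc(int spg_id)
{
	/* prefer huge pages; without SP_HUGEPAGE_ONLY the ALLOC_RETRY
	 * state falls back to normal pages via sp_alloc_fallback() */
	void *va = sp_alloc(SZ_2M, SP_HUGEPAGE, spg_id);

	if (IS_ERR(va))
		pr_err("demo: sp_alloc failed: %ld\n", PTR_ERR(va));

	/* on success, every process in spg_id sees the memory at va */
	return va;
}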
From: Wang Wensheng wangwensheng4@huawei.com
ascend inclusion category: Feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4NDAW CVE: NA
-------------------
Add tasks to a share pool group. The shared memory regions allocated before are mapped into the new joiner too.
Signed-off-by: Wang Wensheng wangwensheng4@huawei.com Signed-off-by: Tang Yizhou tangyizhou@huawei.com Signed-off-by: Peng Wu wupeng58@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Reviewed-by: Weilong Chen chenweilong@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- kernel/sysctl.c | 9 ++ mm/share_pool.c | 411 +++++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 418 insertions(+), 2 deletions(-)
diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 97019c861d73..8c290fcfa32f 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -3315,6 +3315,15 @@ static struct ctl_table vm_table[] = { .extra1 = SYSCTL_ZERO, .extra2 = &ten_thousand, }, + { + .procname = "sharepool_ac_mode", + .data = &sysctl_ac_mode, + .maxlen = sizeof(sysctl_ac_mode), + .mode = 0600, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, #endif { } }; diff --git a/mm/share_pool.c b/mm/share_pool.c index d9f015e22abe..6d64086e0cd9 100644 --- a/mm/share_pool.c +++ b/mm/share_pool.c @@ -84,6 +84,8 @@ static const int mdc_default_group_id = 1; /* share the uva to the whole group */ static int __read_mostly enable_share_k2u_spg = 1;
+/* access control mode */ +int sysctl_ac_mode = AC_NONE; /* debug mode */ int sysctl_sp_debug_mode;
@@ -637,6 +639,14 @@ static inline void check_interrupt_context(void) panic("function can't be used in interrupt context\n"); }
+static inline bool check_aoscore_process(struct task_struct *tsk) +{ + if (tsk->flags & PF_DOMAIN_CORE) + return true; + else + return false; +} + static unsigned long sp_mmap(struct mm_struct *mm, struct file *file, struct sp_area *spa, unsigned long *populate, unsigned long prot); @@ -651,6 +661,12 @@ static void free_sp_group_id(int spg_id) ida_free(&sp_group_id_ida, spg_id); }
+static void free_new_spg_id(bool new, int spg_id) +{ + if (new) + free_sp_group_id(spg_id); +} + static void free_sp_group(struct sp_group *spg) { fput(spg->file); @@ -964,15 +980,406 @@ static struct sp_group *create_spg(int spg_id) return ERR_PTR(ret); }
-int mg_sp_group_add_task(int pid, unsigned long prot, int spg_id) +/* the caller must hold sp_group_sem */ +static struct sp_group *find_or_alloc_sp_group(int spg_id) { + struct sp_group *spg; + + spg = __sp_find_spg_locked(current->pid, spg_id); + + if (!spg) { + spg = create_spg(spg_id); + } else { + down_read(&spg->rw_lock); + if (!spg_valid(spg)) { + up_read(&spg->rw_lock); + sp_group_drop(spg); + return ERR_PTR(-ENODEV); + } + up_read(&spg->rw_lock); + /* spg->use_count has increased due to __sp_find_spg() */ + } + + return spg; +} + +static void __sp_area_drop_locked(struct sp_area *spa); + +/* The caller must down_write(&mm->mmap_lock) */ +static void sp_munmap_task_areas(struct mm_struct *mm, struct sp_group *spg, struct list_head *stop) +{ + struct sp_area *spa, *prev = NULL; + int err; + + + spin_lock(&sp_area_lock); + list_for_each_entry(spa, &spg->spa_list, link) { + if (&spa->link == stop) + break; + + __sp_area_drop_locked(prev); + prev = spa; + + atomic_inc(&spa->use_count); + spin_unlock(&sp_area_lock); + + err = do_munmap(mm, spa->va_start, spa_size(spa), NULL); + if (err) { + /* we are not supposed to fail */ + pr_err("failed to unmap VA %pK when munmap task areas\n", + (void *)spa->va_start); + } + + spin_lock(&sp_area_lock); + } + __sp_area_drop_locked(prev); + + spin_unlock(&sp_area_lock); +} + +/* the caller must hold sp_group_sem */ +static int mm_add_group_init(struct mm_struct *mm, struct sp_group *spg) +{ + struct sp_group_master *master = mm->sp_group_master; + bool exist = false; + + if (share_pool_group_mode == SINGLE_GROUP_MODE && master && + master->count == 1) { + pr_err_ratelimited("at most one sp group for a task is allowed in single mode\n"); + return -EEXIST; + } + + master = sp_init_group_master_locked(mm, &exist); + if (IS_ERR(master)) + return PTR_ERR(master); + + if (!exist) + return 0; + + if (is_process_in_group(spg, mm)) { + pr_err_ratelimited("task already in target group, id=%d\n", spg->id); + return -EEXIST; + } + + if (master->count + 1 == MAX_GROUP_FOR_TASK) { + pr_err("task reaches max group num\n"); + return -ENOSPC; + } + + return 0; +} + +/* the caller must hold sp_group_sem */ +static struct sp_group_node *create_spg_node(struct mm_struct *mm, + unsigned long prot, struct sp_group *spg) +{ + struct sp_group_master *master = mm->sp_group_master; + struct sp_group_node *spg_node; + + spg_node = kzalloc(sizeof(struct sp_group_node), GFP_KERNEL); + if (spg_node == NULL) + return ERR_PTR(-ENOMEM); + + INIT_LIST_HEAD(&spg_node->group_node); + INIT_LIST_HEAD(&spg_node->proc_node); + spg_node->spg = spg; + spg_node->master = master; + spg_node->prot = prot; + + list_add_tail(&spg_node->group_node, &master->node_list); + master->count++; + + return spg_node; +} + +/* the caller must down_write(&spg->rw_lock) */ +static int insert_spg_node(struct sp_group *spg, struct sp_group_node *node) +{ + if (spg->proc_num + 1 == MAX_PROC_PER_GROUP) { + pr_err_ratelimited("add group: group reaches max process num\n"); + return -ENOSPC; + } + + spg->proc_num++; + list_add_tail(&node->proc_node, &spg->procs); return 0; } + +/* the caller must down_write(&spg->rw_lock) */ +static void delete_spg_node(struct sp_group *spg, struct sp_group_node *node) +{ + list_del(&node->proc_node); + spg->proc_num--; +} + +/* the caller must hold sp_group_sem */ +static void free_spg_node(struct mm_struct *mm, struct sp_group *spg, + struct sp_group_node *spg_node) +{ + struct sp_group_master *master = mm->sp_group_master; + + list_del(&spg_node->group_node); + master->count--; 
+ + kfree(spg_node); +} + +/** + * sp_group_add_task() - Add a process to a share group (sp_group). + * @pid: the pid of the task to be added. + * @prot: the prot of task for this spg. + * @spg_id: the ID of the sp_group. + * + * A process can't be added to more than one sp_group in single group mode, + * but can be in multiple group mode. + * + * Return: A positive group number for success, -errno on failure. + * + * The manually specified ID is between [SPG_ID_MIN, SPG_ID_MAX]. + * The automatically allocated ID is between [SPG_ID_AUTO_MIN, SPG_ID_AUTO_MAX]. + * When negative, the return value is -errno. + */ +int mg_sp_group_add_task(int pid, unsigned long prot, int spg_id) +{ + struct task_struct *tsk; + struct mm_struct *mm; + struct sp_group *spg; + struct sp_group_node *node = NULL; + int ret = 0; + bool id_newly_generated = false; + struct sp_area *spa, *prev = NULL; + struct spg_proc_stat *stat; + + check_interrupt_context(); + + /* only allow READ, READ | WRITE */ + if (!((prot == PROT_READ) + || (prot == (PROT_READ | PROT_WRITE)))) { + pr_err_ratelimited("prot is invalid 0x%lx\n", prot); + return -EINVAL; + } + + /* mdc scene hack */ + if (enable_mdc_default_group) + spg_id = mdc_default_group_id; + + if (spg_id < SPG_ID_MIN || spg_id > SPG_ID_AUTO) { + pr_err_ratelimited("add group failed, invalid group id %d\n", spg_id); + return -EINVAL; + } + + if (spg_id >= SPG_ID_AUTO_MIN && spg_id <= SPG_ID_AUTO_MAX) { + spg = __sp_find_spg(pid, spg_id); + + if (!spg) { + pr_err_ratelimited("spg %d hasn't been created\n", spg_id); + return -EINVAL; + } + + down_read(&spg->rw_lock); + if (!spg_valid(spg)) { + up_read(&spg->rw_lock); + pr_err_ratelimited("add group failed, group id %d is dead\n", spg_id); + sp_group_drop(spg); + return -EINVAL; + } + up_read(&spg->rw_lock); + + sp_group_drop(spg); + } + + if (spg_id == SPG_ID_AUTO) { + spg_id = ida_alloc_range(&sp_group_id_ida, SPG_ID_AUTO_MIN, + SPG_ID_AUTO_MAX, GFP_ATOMIC); + if (spg_id < 0) { + pr_err_ratelimited("add group failed, auto generate group id failed\n"); + return spg_id; + } + id_newly_generated = true; + } + + down_write(&sp_group_sem); + + ret = get_task(pid, &tsk); + if (ret) { + up_write(&sp_group_sem); + free_new_spg_id(id_newly_generated, spg_id); + goto out; + } + + if (check_aoscore_process(tsk)) { + up_write(&sp_group_sem); + ret = -EACCES; + free_new_spg_id(id_newly_generated, spg_id); + sp_dump_stack(); + goto out_put_task; + } + + /* + * group_leader: current thread may be exiting in a multithread process + * + * DESIGN IDEA + * We increase mm->mm_users deliberately to ensure it's decreased in + * share pool under only 2 circumstances, which will simplify the overall + * design as mm won't be freed unexpectedly. + * + * The corresponding refcount decrements are as follows: + * 1. the error handling branch of THIS function. + * 2. In sp_group_exit(). It's called only when process is exiting. 
+ */ + mm = get_task_mm(tsk->group_leader); + if (!mm) { + up_write(&sp_group_sem); + ret = -ESRCH; + free_new_spg_id(id_newly_generated, spg_id); + goto out_put_task; + } + + spg = find_or_alloc_sp_group(spg_id); + if (IS_ERR(spg)) { + up_write(&sp_group_sem); + ret = PTR_ERR(spg); + free_new_spg_id(id_newly_generated, spg_id); + goto out_put_mm; + } + + /* access control permission check */ + if (sysctl_ac_mode == AC_SINGLE_OWNER) { + if (spg->owner != current->group_leader) { + ret = -EPERM; + goto out_drop_group; + } + } + + ret = mm_add_group_init(mm, spg); + if (ret) + goto out_drop_group; + + node = create_spg_node(mm, prot, spg); + if (unlikely(IS_ERR(node))) { + ret = PTR_ERR(node); + goto out_drop_spg_node; + } + + /* per process statistics initialization */ + stat = sp_init_process_stat(tsk, mm, spg); + if (IS_ERR(stat)) { + ret = PTR_ERR(stat); + pr_err_ratelimited("init process stat failed %lx\n", PTR_ERR(stat)); + goto out_drop_spg_node; + } + + down_write(&spg->rw_lock); + ret = insert_spg_node(spg, node); + if (unlikely(ret)) { + up_write(&spg->rw_lock); + goto out_drop_spg_node; + } + + /* + * create mappings of existing shared memory segments into this + * new process' page table. + */ + spin_lock(&sp_area_lock); + + list_for_each_entry(spa, &spg->spa_list, link) { + unsigned long populate = 0; + struct file *file = spa_file(spa); + unsigned long addr; + + __sp_area_drop_locked(prev); + prev = spa; + + atomic_inc(&spa->use_count); + + if (spa->is_dead == true) + continue; + + spin_unlock(&sp_area_lock); + + if (spa->type == SPA_TYPE_K2SPG && spa->kva) { + addr = sp_remap_kva_to_vma(spa->kva, spa, mm, prot); + if (IS_ERR_VALUE(addr)) + pr_warn("add group remap k2u failed %ld\n", addr); + + spin_lock(&sp_area_lock); + continue; + } + + down_write(&mm->mmap_lock); + if (unlikely(mm->core_state)) { + sp_munmap_task_areas(mm, spg, &spa->link); + up_write(&mm->mmap_lock); + ret = -EBUSY; + pr_err("add group: encountered coredump, abort\n"); + spin_lock(&sp_area_lock); + break; + } + + addr = sp_mmap(mm, file, spa, &populate, prot); + if (IS_ERR_VALUE(addr)) { + sp_munmap_task_areas(mm, spg, &spa->link); + up_write(&mm->mmap_lock); + ret = addr; + pr_err("add group: sp mmap failed %d\n", ret); + spin_lock(&sp_area_lock); + break; + } + up_write(&mm->mmap_lock); + + if (populate) { + ret = do_mm_populate(mm, spa->va_start, populate, 0); + if (ret) { + if (unlikely(fatal_signal_pending(current))) + pr_warn_ratelimited("add group failed, current thread is killed\n"); + else + pr_warn_ratelimited("add group failed, mm populate failed (potential no enough memory when -12): %d, spa type is %d\n", + ret, spa->type); + down_write(&mm->mmap_lock); + sp_munmap_task_areas(mm, spg, spa->link.next); + up_write(&mm->mmap_lock); + spin_lock(&sp_area_lock); + break; + } + } + + spin_lock(&sp_area_lock); + } + __sp_area_drop_locked(prev); + spin_unlock(&sp_area_lock); + + if (unlikely(ret)) + delete_spg_node(spg, node); + up_write(&spg->rw_lock); + +out_drop_spg_node: + if (unlikely(ret)) + free_spg_node(mm, spg, node); + /* + * to simplify design, we don't release the resource of + * group_master and proc_stat, they will be freed when + * process is exiting. + */ +out_drop_group: + if (unlikely(ret)) { + up_write(&sp_group_sem); + sp_group_drop(spg); + } else + up_write(&sp_group_sem); +out_put_mm: + /* No need to put the mm if the sp group adds this mm successfully */ + if (unlikely(ret)) + mmput(mm); +out_put_task: + put_task_struct(tsk); +out: + return ret == 0 ? 
spg_id : ret; +} EXPORT_SYMBOL_GPL(mg_sp_group_add_task);
int sp_group_add_task(int pid, int spg_id) { - return 0; + return mg_sp_group_add_task(pid, PROT_READ | PROT_WRITE, spg_id); } EXPORT_SYMBOL_GPL(sp_group_add_task);
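A short sketch of the intended call pattern, following the kernel-doc above; the pids and the demo_build_group() helper are hypothetical:

/* hypothetical setup: put two tasks into one auto-allocated group */
static int demo_build_group(int pid_a, int pid_b)
{
	/* SPG_ID_AUTO: a positive group id is allocated and returned */
	int spg_id = mg_sp_group_add_task(pid_a, PROT_READ | PROT_WRITE,
					  SPG_ID_AUTO);

	if (spg_id < 0)
		return spg_id;

	/* the second task joins the group created by the first call */
	return mg_sp_group_add_task(pid_b, PROT_READ | PROT_WRITE, spg_id);
}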
From: Wang Wensheng wangwensheng4@huawei.com
ascend inclusion category: Feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4NDAW CVE: NA
-------------------
This interface is added to support removing a process from a share pool group.
Signed-off-by: Wang Wensheng wangwensheng4@huawei.com Signed-off-by: Guo Mengqi guomengqi3@huawei.com Signed-off-by: Zhou Guanghui zhouguanghui1@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Reviewed-by: Weilong Chen chenweilong@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- mm/share_pool.c | 121 +++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 119 insertions(+), 2 deletions(-)
diff --git a/mm/share_pool.c b/mm/share_pool.c index 6d64086e0cd9..8dc64232f0db 100644 --- a/mm/share_pool.c +++ b/mm/share_pool.c @@ -156,6 +156,21 @@ static struct sp_group_master *sp_init_group_master_locked( return master; }
+static struct sp_proc_stat *sp_get_proc_stat(struct mm_struct *mm) +{ + struct sp_proc_stat *stat; + + if (!mm->sp_group_master) + return NULL; + + down_read(&sp_proc_stat_sem); + stat = mm->sp_group_master->stat; + up_read(&sp_proc_stat_sem); + + /* may be NULL or not; we always return it */ + return stat; +} + static struct sp_proc_stat *create_proc_stat(struct mm_struct *mm, struct task_struct *tsk) { @@ -1383,7 +1398,26 @@ int sp_group_add_task(int pid, int spg_id) } EXPORT_SYMBOL_GPL(sp_group_add_task);
-static void __sp_area_drop_locked(struct sp_area *spa); +static void free_spg_proc_stat(struct mm_struct *mm, int spg_id) +{ + int i; + struct sp_proc_stat *proc_stat = sp_get_proc_stat(mm); + struct spg_proc_stat *stat; + struct sp_spg_stat *spg_stat; + struct hlist_node *tmp; + + hash_for_each_safe(proc_stat->hash, i, tmp, stat, pnode) { + if (stat->spg_id == spg_id) { + spg_stat = stat->spg_stat; + mutex_lock(&spg_stat->lock); + hash_del(&stat->gnode); + mutex_unlock(&spg_stat->lock); + hash_del(&stat->pnode); + kfree(stat); + break; + } + } +}
/** * mg_sp_group_del_task() - delete a process from a sp group. @@ -1399,7 +1433,90 @@ static void __sp_area_drop_locked(struct sp_area *spa); */ int mg_sp_group_del_task(int pid, int spg_id) { - return 0; + int ret = 0; + struct sp_group *spg; + struct sp_group_node *spg_node; + struct task_struct *tsk = NULL; + struct mm_struct *mm = NULL; + bool is_alive = true; + + if (spg_id < SPG_ID_MIN || spg_id > SPG_ID_AUTO) { + pr_err_ratelimited("del from group failed, invalid group id %d\n", spg_id); + return -EINVAL; + } + + spg = __sp_find_spg(pid, spg_id); + if (!spg) { + pr_err_ratelimited("spg not found or get task failed."); + return -EINVAL; + } + down_write(&sp_group_sem); + + if (!spg_valid(spg)) { + up_write(&sp_group_sem); + pr_err_ratelimited("spg dead."); + ret = -EINVAL; + goto out; + } + + if (!list_empty(&spg->spa_list)) { + up_write(&sp_group_sem); + pr_err_ratelimited("spa is not empty"); + ret = -EINVAL; + goto out; + } + + ret = get_task(pid, &tsk); + if (ret) { + up_write(&sp_group_sem); + pr_err_ratelimited("task is not found"); + goto out; + } + mm = get_task_mm(tsk->group_leader); + if (!mm) { + up_write(&sp_group_sem); + pr_err_ratelimited("mm is not found"); + ret = -ESRCH; + goto out_put_task; + } + + spg_node = is_process_in_group(spg, mm); + if (!spg_node) { + up_write(&sp_group_sem); + pr_err_ratelimited("process not in group"); + ret = -ESRCH; + goto out_put_mm; + } + + down_write(&spg->rw_lock); + if (list_is_singular(&spg->procs)) + is_alive = spg->is_alive = false; + spg->proc_num--; + list_del(&spg_node->proc_node); + sp_group_drop(spg); + up_write(&spg->rw_lock); + if (!is_alive) + blocking_notifier_call_chain(&sp_notifier_chain, 0, spg); + + list_del(&spg_node->group_node); + mm->sp_group_master->count--; + kfree(spg_node); + if (atomic_sub_and_test(1, &mm->mm_users)) { + up_write(&sp_group_sem); + WARN(1, "Invalid user counting\n"); + return -EINVAL; + } + + free_spg_proc_stat(mm, spg_id); + up_write(&sp_group_sem); + +out_put_mm: + mmput(mm); +out_put_task: + put_task_struct(tsk); +out: + sp_group_drop(spg); /* if spg dead, freed here */ + return ret; } EXPORT_SYMBOL_GPL(mg_sp_group_del_task);
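For illustration, a sketch of the teardown side. Note from the code above that deletion fails with -EINVAL while the group still owns sp_area regions, so all sp_free()/sp_unshare() calls must happen first (demo_leave_group() is hypothetical):

/* hypothetical teardown: remove a task once spg->spa_list is empty */
static int demo_leave_group(int pid, int spg_id)
{
	int ret = mg_sp_group_del_task(pid, spg_id);

	if (ret)
		pr_err("demo: removing task %d from spg %d failed: %d\n",
		       pid, spg_id, ret);
	return ret;
}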
From: Wang Wensheng wangwensheng4@huawei.com
ascend inclusion category: Feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4NDAW CVE: NA
-------------------
fork() will create a new mm for the new process; the mm should not take any share pool information from the parent process, so we need to clean it.
exit() will mmput the mm and free the memory; if the mm is already used by an sp_group, the group needs to be cleaned up first.
Signed-off-by: Wang Wensheng wangwensheng4@huawei.com Signed-off-by: Tang Yizhou tangyizhou@huawei.com Signed-off-by: Peng Wu wupeng58@huawei.com Signed-off-by: Zhou Guanghui zhouguanghui1@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Reviewed-by: Weilong Chen chenweilong@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- include/linux/share_pool.h | 2 + kernel/fork.c | 7 ++ mm/mmap.c | 5 ++ mm/share_pool.c | 128 +++++++++++++++++++++++++++++++++++++ 4 files changed, 142 insertions(+)
diff --git a/include/linux/share_pool.h b/include/linux/share_pool.h index 6ec844708f83..ac637359e158 100644 --- a/include/linux/share_pool.h +++ b/include/linux/share_pool.h @@ -258,6 +258,8 @@ extern int mg_sp_group_add_task(int pid, unsigned long prot, int spg_id); extern int sp_group_add_task(int pid, int spg_id);
extern void sp_area_drop(struct vm_area_struct *vma); +extern int sp_group_exit(struct mm_struct *mm); +extern void sp_group_post_exit(struct mm_struct *mm);
static inline bool sp_is_enabled(void) { diff --git a/kernel/fork.c b/kernel/fork.c index 454b42af1de8..bf27ee90ad23 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -98,6 +98,7 @@ #include <linux/io_uring.h> #include <linux/share_pool.h>
+#include <linux/share_pool.h> #include <asm/pgalloc.h> #include <linux/uaccess.h> #include <asm/mmu_context.h> @@ -1092,6 +1093,9 @@ static inline void __mmput(struct mm_struct *mm) ksm_exit(mm); khugepaged_exit(mm); /* must run before exit_mmap */ exit_mmap(mm); + + sp_group_post_exit(mm); + mm_put_huge_zero_page(mm); set_mm_exe_file(mm, NULL); if (!list_empty(&mm->mmlist)) { @@ -1111,6 +1115,9 @@ void mmput(struct mm_struct *mm) { might_sleep();
+ if (sp_group_exit(mm)) + return; + if (atomic_dec_and_test(&mm->mm_users)) __mmput(mm); } diff --git a/mm/mmap.c b/mm/mmap.c index d5a97a56dca7..c616e99e7672 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -183,6 +183,7 @@ static struct vm_area_struct *remove_vma(struct vm_area_struct *vma) if (vma->vm_file) fput(vma->vm_file); mpol_put(vma_policy(vma)); + sp_area_drop(vma); vm_area_free(vma); return next; } @@ -1174,6 +1175,10 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm, if (vm_flags & VM_SPECIAL) return NULL;
+ /* don't merge this kind of vma as sp_area couldn't be merged */ + if (sp_check_vm_share_pool(vm_flags)) + return NULL; + next = vma_next(mm, prev); area = next; if (area && area->vm_end == end) /* cases 6, 7, 8 */ diff --git a/mm/share_pool.c b/mm/share_pool.c index 8dc64232f0db..96fc899617a5 100644 --- a/mm/share_pool.c +++ b/mm/share_pool.c @@ -4139,6 +4139,134 @@ static void __init proc_sharepool_init(void)
/*** End of statistical and maintenance functions ***/
+#define MM_WOULD_FREE 1 + +/* + * Recall we add mm->mm_users by 1 deliberately in sp_group_add_task(). + * If mm_users == sp_group_master->count + 1, it means that the mm is ready + * to be freed because the last owner of this mm is in the exiting procedure: + * do_exit() -> exit_mm() -> mmput() -> sp_group_exit -> THIS function. + */ +static bool need_free_sp_group(struct mm_struct *mm, + struct sp_group_master *master) +{ + /* thread exits but process is still alive */ + if ((unsigned int)atomic_read(&mm->mm_users) != master->count + MM_WOULD_FREE) { + if (atomic_dec_and_test(&mm->mm_users)) + WARN(1, "Invalid user counting\n"); + return false; + } + + return true; +} + +/* + * Return: + * 1 - let mmput() return immediately + * 0 - let mmput() decrease mm_users and try __mmput() + */ +int sp_group_exit(struct mm_struct *mm) +{ + struct sp_group *spg; + struct sp_group_master *master; + struct sp_group_node *spg_node, *tmp; + bool is_alive = true; + + if (!sp_is_enabled()) + return 0; + + down_write(&sp_group_sem); + + master = mm->sp_group_master; + if (!master) { + up_write(&sp_group_sem); + return 0; + } + + if (!need_free_sp_group(mm, master)) { + up_write(&sp_group_sem); + return 1; + } + + list_for_each_entry_safe(spg_node, tmp, &master->node_list, group_node) { + spg = spg_node->spg; + + down_write(&spg->rw_lock); + /* a dead group should NOT be reactivated */ + if (spg_valid(spg) && list_is_singular(&spg->procs)) + is_alive = spg->is_alive = false; + spg->proc_num--; + list_del(&spg_node->proc_node); + up_write(&spg->rw_lock); + + if (!is_alive) + blocking_notifier_call_chain(&sp_notifier_chain, 0, + spg); + } + + /* match with get_task_mm() in sp_group_add_task() */ + if (atomic_sub_and_test(master->count, &mm->mm_users)) { + up_write(&sp_group_sem); + WARN(1, "Invalid user counting\n"); + return 1; + } + + up_write(&sp_group_sem); + return 0; +} + +void sp_group_post_exit(struct mm_struct *mm) +{ + struct sp_proc_stat *stat; + long alloc_size, k2u_size; + /* lockless visit */ + struct sp_group_master *master = mm->sp_group_master; + struct sp_group_node *spg_node, *tmp; + struct sp_group *spg; + + if (!sp_is_enabled() || !master) + return; + + /* + * There are two basic scenarios when a process in the share pool is + * exiting but its share pool memory usage is not 0. + * 1. Process A called sp_alloc(), but it terminates without calling + * sp_free(). Then its share pool memory usage is a positive number. + * 2. Process A never called sp_alloc(), and process B in the same spg + * called sp_alloc() to get an addr u. Then A gets u somehow and + * called sp_free(u). Now A's share pool memory usage is a negative + * number. Notice B's memory usage will be a positive number. + * + * We decide to print an info message when seeing either of the scenarios. + * + * A process not in an sp group doesn't need to print because there + * won't be any memory which is not freed. + */ + stat = sp_get_proc_stat(mm); + if (stat) { + alloc_size = atomic64_read(&stat->alloc_size); + k2u_size = atomic64_read(&stat->k2u_size); + + if (alloc_size != 0 || k2u_size != 0) + pr_info("process %s(%d) exits. 
It applied %ld aligned KB, k2u shared %ld aligned KB\n", + stat->comm, stat->tgid, + byte2kb(alloc_size), byte2kb(k2u_size)); + + /* match with sp_init_proc_stat, we expect stat is released after this call */ + sp_proc_stat_drop(stat); + } + + /* lockless traverse */ + list_for_each_entry_safe(spg_node, tmp, &master->node_list, group_node) { + spg = spg_node->spg; + /* match with refcount inc in sp_group_add_task */ + sp_group_drop(spg); + kfree(spg_node); + } + + kfree(master); +} + DEFINE_STATIC_KEY_FALSE(share_pool_enabled_key);
static int __init enable_share_pool(char *s)
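The reference counting above is easiest to check with concrete numbers; a worked illustration, assuming a process that joined two groups (so sp_group_add_task() took two extra mm_users references) and whose last thread is now exiting:

/*
 * illustration only: master->count == 2, MM_WOULD_FREE == 1
 *
 *   reference held by the exiting process itself:      1
 *   extra references taken by sp_group_add_task():   + 2
 *   atomic_read(&mm->mm_users) == 3 == master->count + MM_WOULD_FREE
 *
 * need_free_sp_group() returns true, sp_group_exit() subtracts
 * master->count and returns 0, and mmput() drops the final reference,
 * reaching __mmput() -> sp_group_post_exit().  Any other mm_users value
 * means only a thread is exiting: one reference is dropped and mmput()
 * returns immediately.
 */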
From: Wang Wensheng wangwensheng4@huawei.com
ascend inclusion category: Feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4NDAW CVE: NA
-------------------
The situation below is not allowed:
int *result = mmap(ADDR, sizeof(int), PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0);
As share pool uses an independent UVA allocation algorithm, it may produce an address that conflicts with a user-specified address.
Signed-off-by: Tang Yizhou tangyizhou@huawei.com Signed-off-by: Wang Wensheng wangwensheng4@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Reviewed-by: Weilong Chen chenweilong@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- include/linux/share_pool.h | 2 ++ mm/mmap.c | 9 +++++++++ mm/mremap.c | 4 ++++ mm/share_pool.c | 20 ++++++++++++++++++++ 4 files changed, 35 insertions(+)
diff --git a/include/linux/share_pool.h b/include/linux/share_pool.h index ac637359e158..0254ea95f034 100644 --- a/include/linux/share_pool.h +++ b/include/linux/share_pool.h @@ -260,6 +260,8 @@ extern int sp_group_add_task(int pid, int spg_id); extern void sp_area_drop(struct vm_area_struct *vma); extern int sp_group_exit(struct mm_struct *mm); extern void sp_group_post_exit(struct mm_struct *mm); +extern bool sp_check_addr(unsigned long addr); +extern bool sp_check_mmap_addr(unsigned long addr, unsigned long flags);
static inline bool sp_is_enabled(void) { diff --git a/mm/mmap.c b/mm/mmap.c index c616e99e7672..36ef4c2b93a9 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2430,6 +2430,9 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr, if (len > mmap_end - mmap_min_addr) return -ENOMEM;
+ if (sp_check_mmap_addr(addr, flags)) + return -EINVAL; + if (flags & MAP_FIXED) return addr;
@@ -2481,6 +2484,9 @@ arch_get_unmapped_area_topdown(struct file *filp, unsigned long addr, if (len > mmap_end - mmap_min_addr) return -ENOMEM;
+ if (sp_check_mmap_addr(addr, flags)) + return -EINVAL; + if (flags & MAP_FIXED) return addr;
@@ -3209,6 +3215,9 @@ static int __vm_munmap(unsigned long start, size_t len, bool downgrade) struct mm_struct *mm = current->mm; LIST_HEAD(uf);
+ if (sp_check_addr(start)) + return -EINVAL; + if (mmap_write_lock_killable(mm)) return -EINTR;
diff --git a/mm/mremap.c b/mm/mremap.c index d4d66d1b6ec7..ecfca97b97ae 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -24,6 +24,7 @@ #include <linux/uaccess.h> #include <linux/mm-arch-hooks.h> #include <linux/userfaultfd_k.h> +#include <linux/share_pool.h>
#include <asm/cacheflush.h> #include <asm/tlb.h> @@ -929,6 +930,9 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, if (offset_in_page(addr)) return ret;
+ if (sp_check_addr(addr) || sp_check_addr(new_addr)) + return ret; + old_len = PAGE_ALIGN(old_len); new_len = PAGE_ALIGN(new_len);
diff --git a/mm/share_pool.c b/mm/share_pool.c index 96fc899617a5..7853054ed005 100644 --- a/mm/share_pool.c +++ b/mm/share_pool.c @@ -4139,6 +4139,26 @@ static void __init proc_sharepool_init(void)
/*** End of statistical and maintenance functions ***/
+bool sp_check_addr(unsigned long addr) +{ + if (sp_is_enabled() && is_sharepool_addr(addr) && + !check_aoscore_process(current)) { + sp_dump_stack(); + return true; + } + return false; +} + +bool sp_check_mmap_addr(unsigned long addr, unsigned long flags) +{ + if (sp_is_enabled() && is_sharepool_addr(addr) && + !check_aoscore_process(current) && !(flags & MAP_SHARE_POOL)) { + sp_dump_stack(); + return true; + } + return false; +} + #define MM_WOULD_FREE 1
/*
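Seen from user space, the new checks behave as below; a minimal sketch, assuming sp_addr lies inside the share pool address range (for instance a value returned by sp_alloc()) and MAP_SHARE_POOL is not passed:

#include <errno.h>
#include <stdio.h>
#include <sys/mman.h>

/* hypothetical probe: a fixed mapping over the share pool range is
 * rejected by the patched arch_get_unmapped_area() with -EINVAL */
static void demo_probe(void *sp_addr)
{
	void *p = mmap(sp_addr, 4096, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0);

	if (p == MAP_FAILED && errno == EINVAL)
		printf("conflicting fixed mapping rejected, as intended\n");

	/* munmap() and mremap() on such an address fail the same way */
}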
From: Wang Wensheng wangwensheng4@huawei.com
ascend inclusion category: Feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4NDAW CVE: NA
-------------------
We store the preferred node_id in sp_area at sp_alloc() time and use it for memory allocation in shmem_fault().
Signed-off-by: Wang Wensheng wangwensheng4@huawei.com Signed-off-by: Peng Wu wupeng58@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Reviewed-by: Weilong Chen chenweilong@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- include/linux/share_pool.h | 1 + mm/share_pool.c | 20 +++++++++++++++++++- mm/shmem.c | 5 +++-- 3 files changed, 23 insertions(+), 3 deletions(-)
diff --git a/include/linux/share_pool.h b/include/linux/share_pool.h index 0254ea95f034..ca9fcde90211 100644 --- a/include/linux/share_pool.h +++ b/include/linux/share_pool.h @@ -262,6 +262,7 @@ extern int sp_group_exit(struct mm_struct *mm); extern void sp_group_post_exit(struct mm_struct *mm); extern bool sp_check_addr(unsigned long addr); extern bool sp_check_mmap_addr(unsigned long addr, unsigned long flags); +extern int sp_node_id(struct vm_area_struct *vma);
static inline bool sp_is_enabled(void) { diff --git a/mm/share_pool.c b/mm/share_pool.c index 7853054ed005..05218c6dd250 100644 --- a/mm/share_pool.c +++ b/mm/share_pool.c @@ -1971,7 +1971,6 @@ static int sp_check_caller_permission(struct sp_group *spg, struct mm_struct *mm return ret; }
- #define FREE_CONT 1 #define FREE_END 2
@@ -3616,6 +3615,25 @@ bool mg_is_sharepool_addr(unsigned long addr) } EXPORT_SYMBOL_GPL(mg_is_sharepool_addr);
+int sp_node_id(struct vm_area_struct *vma) +{ + struct sp_area *spa; + int node_id = numa_node_id(); + + if (!sp_is_enabled()) + return node_id; + + if (vma) { + spa = __find_sp_area(vma->vm_start); + if (spa) { + node_id = spa->node_id; + __sp_area_drop(spa); + } + } + + return node_id; +} + static int __init mdc_default_group(char *s) { enable_mdc_default_group = 1; diff --git a/mm/shmem.c b/mm/shmem.c index 51f8f3b75803..60644e9b1a7b 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -80,6 +80,7 @@ static struct vfsmount *shm_mnt; #include <linux/userfaultfd_k.h> #include <linux/rmap.h> #include <linux/uuid.h> +#include <linux/share_pool.h>
#include <linux/uaccess.h>
@@ -1889,11 +1890,11 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index, }
alloc_huge: - page = shmem_alloc_and_acct_page(gfp, inode, index, true, numa_node_id()); + page = shmem_alloc_and_acct_page(gfp, inode, index, true, sp_node_id(vma)); if (IS_ERR(page)) { alloc_nohuge: page = shmem_alloc_and_acct_page(gfp, inode, - index, false, numa_node_id()); + index, false, sp_node_id(vma)); } if (IS_ERR(page)) { int retry = 5;
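A small sketch of how a caller could use the new helper; the numa_node_id() fallback keeps the old behaviour for non-share-pool VMAs (demo_alloc_for_vma() is hypothetical):

/* hypothetical: allocate one page on the node preferred by the VMA */
static struct page *demo_alloc_for_vma(struct vm_area_struct *vma, gfp_t gfp)
{
	/* share pool VMAs resolve to spa->node_id, others to the local node */
	int nid = sp_node_id(vma);

	return alloc_pages_node(nid, gfp, 0);
}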
From: Wang Wensheng wangwensheng4@huawei.com
ascend inclusion category: Feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4NDAW CVE: NA
-------------------
In the share pool scenario, when shared memory is allocated, do_mm_populate() is performed at the same time, that is, the corresponding pages are allocated immediately. In the current share pool implementation, the memory is charged to the memcg of the first task added to the share pool group.
This is unreasonable and may cause the memcg of the first task to OOM. Instead, the pages should be charged to the memcg of the current task.
Signed-off-by: Zhou Guanghui zhouguanghui1@huawei.com Signed-off-by: Wang Wensheng wangwensheng4@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Reviewed-by: Weilong Chen chenweilong@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- mm/shmem.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/mm/shmem.c b/mm/shmem.c index 60644e9b1a7b..d2e86b35c7ff 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1819,7 +1819,7 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index, }
sbinfo = SHMEM_SB(inode->i_sb); - charge_mm = vma ? vma->vm_mm : current->mm; + charge_mm = vma && !sp_check_vm_share_pool(vma->vm_flags) ? vma->vm_mm : current->mm;
page = pagecache_get_page(mapping, index, FGP_ENTRY | FGP_HEAD | FGP_LOCK, 0);
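The effect of the one-line change, sketched for a two-task group where task A created the mapping and task B touches a page first; sp_check_vm_share_pool() is the VM_SHARE_POOL test used earlier in this series:

/*
 * illustration of the charge_mm decision in shmem_getpage_gfp():
 *
 *   before: charge_mm = vma->vm_mm      -> always A's memcg is charged
 *   after:  share pool VMA detected     -> charge_mm = current->mm,
 *           so the faulting task B's memcg is charged instead
 */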
From: Tang Yizhou tangyizhou@huawei.com
ascend inclusion category: Feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4NDAW CVE: NA
-------------------
Show the "sp" flag in the VmFlags line of /proc/<pid>/smaps for share pool VMAs, for example:

e80000600000-e80000603000 rw-s 00600000 00:05 1025  /sp_group_1 (deleted)
Size:                 12 kB
KernelPageSize:        4 kB
MMUPageSize:           4 kB
Rss:                   0 kB
Pss:                   0 kB
Shared_Clean:          0 kB
Shared_Dirty:          0 kB
Private_Clean:         0 kB
Private_Dirty:         0 kB
Referenced:            0 kB
Anonymous:             0 kB
LazyFree:              0 kB
AnonHugePages:         0 kB
ShmemPmdMapped:        0 kB
Shared_Hugetlb:        0 kB
Private_Hugetlb:       0 kB
Swap:                  0 kB
SwapPss:               0 kB
Locked:                0 kB
THPeligible:    0
VmFlags: rd wr sh mr mw me ms pf io dc de nr dd sp
                                                ~~
Signed-off-by: Tang Yizhou tangyizhou@huawei.com Signed-off-by: Wang Wensheng wangwensheng4@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Reviewed-by: Weilong Chen chenweilong@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- fs/proc/task_mmu.c | 3 +++ 1 file changed, 3 insertions(+)
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 464562ad470f..4da09b397370 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -663,6 +663,9 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma) #endif /* CONFIG_ARCH_HAS_PKEYS */ #ifdef CONFIG_USERSWAP [ilog2(VM_USWAP)] = "us", +#endif +#ifdef CONFIG_ASCEND_SHARE_POOL + [ilog2(VM_SHARE_POOL)] = "sp", #endif }; size_t i;
From: Wang Wensheng wangwensheng4@huawei.com
ascend inclusion category: Feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4NDAW CVE: NA
-------------------
Sharepool adds a dedicated interface for huge pages, which optimizes the efficiency of memory allocation.
Signed-off-by: Weilong Chen chenweilong@huawei.com Signed-off-by: Wang Wensheng wangwensheng4@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- include/linux/hugetlb.h | 4 ++ include/linux/share_pool.h | 12 +++++ mm/hugetlb.c | 10 +++- mm/share_pool.c | 97 ++++++++++++++++++++++++++++++++++++++ 4 files changed, 122 insertions(+), 1 deletion(-)
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 397e6bfa8268..1b586a324b8e 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -1120,4 +1120,8 @@ static inline __init void hugetlb_cma_check(void) } #endif
+#ifdef CONFIG_ASCEND_SHARE_POOL +pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page, int writable); +#endif + #endif /* _LINUX_HUGETLB_H */ diff --git a/include/linux/share_pool.h b/include/linux/share_pool.h index ca9fcde90211..6f294911c6af 100644 --- a/include/linux/share_pool.h +++ b/include/linux/share_pool.h @@ -260,6 +260,10 @@ extern int sp_group_add_task(int pid, int spg_id); extern void sp_area_drop(struct vm_area_struct *vma); extern int sp_group_exit(struct mm_struct *mm); extern void sp_group_post_exit(struct mm_struct *mm); +vm_fault_t sharepool_no_page(struct mm_struct *mm, + struct vm_area_struct *vma, + struct address_space *mapping, pgoff_t idx, + unsigned long address, pte_t *ptep, unsigned int flags); extern bool sp_check_addr(unsigned long addr); extern bool sp_check_mmap_addr(unsigned long addr, unsigned long flags); extern int sp_node_id(struct vm_area_struct *vma); @@ -515,6 +519,14 @@ static inline bool sp_check_mmap_addr(unsigned long addr, unsigned long flags) return false; }
+static inline vm_fault_t sharepool_no_page(struct mm_struct *mm, + struct vm_area_struct *vma, + struct address_space *mapping, pgoff_t idx, + unsigned long address, pte_t *ptep, unsigned int flags) +{ + return VM_FAULT_SIGBUS; +} + #endif /* !CONFIG_ASCEND_SHARE_POOL */
#endif /* LINUX_SHARE_POOL_H */ diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 74d23542f9f2..44691ce8a9b8 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -3861,8 +3861,13 @@ const struct vm_operations_struct hugetlb_vm_ops = { .pagesize = hugetlb_vm_op_pagesize, };
+#ifdef CONFIG_ASCEND_SHARE_POOL +pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page, + int writable) +#else static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page, int writable) +#endif { pte_t entry;
@@ -4727,7 +4732,10 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
entry = huge_ptep_get(ptep); if (huge_pte_none(entry)) { - ret = hugetlb_no_page(mm, vma, mapping, idx, address, ptep, flags); + if (sp_check_vm_share_pool(vma->vm_flags)) + ret = sharepool_no_page(mm, vma, mapping, idx, address, ptep, flags); + else + ret = hugetlb_no_page(mm, vma, mapping, idx, address, ptep, flags); goto out_mutex; }
diff --git a/mm/share_pool.c b/mm/share_pool.c index 05218c6dd250..494a829d6f3a 100644 --- a/mm/share_pool.c +++ b/mm/share_pool.c @@ -4177,6 +4177,103 @@ bool sp_check_mmap_addr(unsigned long addr, unsigned long flags) return false; }
+vm_fault_t sharepool_no_page(struct mm_struct *mm, + struct vm_area_struct *vma, + struct address_space *mapping, pgoff_t idx, + unsigned long address, pte_t *ptep, unsigned int flags) +{ + struct hstate *h = hstate_vma(vma); + vm_fault_t ret = VM_FAULT_SIGBUS; + unsigned long size; + struct page *page; + pte_t new_pte; + spinlock_t *ptl; + unsigned long haddr = address & huge_page_mask(h); + bool new_page = false; + int err; + int node_id; + struct sp_area *spa; + + spa = __find_sp_area(vma->vm_start); + if (!spa) { + pr_err("share pool: vma is invalid, not from sp mmap\n"); + return ret; + } + node_id = spa->node_id; + __sp_area_drop(spa); + +retry: + page = find_lock_page(mapping, idx); + if (!page) { + size = i_size_read(mapping->host) >> huge_page_shift(h); + if (idx >= size) + goto out; + + page = alloc_huge_page(vma, haddr, 0); + if (IS_ERR(page)) { + page = alloc_huge_page_nodemask(hstate_file(vma->vm_file), + node_id, NULL, GFP_KERNEL); + if (!page) + page = ERR_PTR(-ENOMEM); + } + if (IS_ERR(page)) { + ptl = huge_pte_lock(h, mm, ptep); + if (!huge_pte_none(huge_ptep_get(ptep))) { + ret = 0; + spin_unlock(ptl); + goto out; + } + spin_unlock(ptl); + ret = vmf_error(PTR_ERR(page)); + goto out; + } + __SetPageUptodate(page); + new_page = true; + + /* sharepool pages are all shared */ + err = huge_add_to_page_cache(page, mapping, idx); + if (err) { + put_page(page); + if (err == -EEXIST) + goto retry; + goto out; + } + } + + + ptl = huge_pte_lock(h, mm, ptep); + size = i_size_read(mapping->host) >> huge_page_shift(h); + if (idx >= size) + goto backout; + + ret = 0; + if (!huge_pte_none(huge_ptep_get(ptep))) + goto backout; + + page_dup_rmap(page, true); + new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE) + && (vma->vm_flags & VM_SHARED))); + set_huge_pte_at(mm, haddr, ptep, new_pte); + + hugetlb_count_add(pages_per_huge_page(h), mm); + + spin_unlock(ptl); + + if (new_page) { + SetPagePrivate(&page[1]); + } + + unlock_page(page); +out: + return ret; + +backout: + spin_unlock(ptl); + unlock_page(page); + put_page(page); + goto out; +} + #define MM_WOULD_FREE 1
/*