Kernel
Threads by month
- ----- 2025 -----
- May
- April
- March
- February
- January
- ----- 2024 -----
- December
- November
- October
- September
- August
- July
- June
- May
- April
- March
- February
- January
- ----- 2023 -----
- December
- November
- October
- September
- August
- July
- June
- May
- April
- March
- February
- January
- ----- 2022 -----
- December
- November
- October
- September
- August
- July
- June
- May
- April
- March
- February
- January
- ----- 2021 -----
- December
- November
- October
- September
- August
- July
- June
- May
- April
- March
- February
- January
- ----- 2020 -----
- December
- November
- October
- September
- August
- July
- June
- May
- April
- March
- February
- January
- ----- 2019 -----
- December
- 3 participants
- 17954 discussions

[PATCH openEuler-1.0-LTS] wifi: brcmfmac: slab-out-of-bounds read in brcmf_get_assoc_ies()
by Yongqiang Liu 23 Mar '23
by Yongqiang Liu 23 Mar '23
23 Mar '23
From: Jisoo Jang <jisoo.jang(a)yonsei.ac.kr>
maillist inclusion
category: bugfix
bugzilla: https://gitee.com/src-openeuler/kernel/issues/I6NCVX
CVE: CVE-2023-1380
Reference: https://patchwork.kernel.org/project/linux-wireless/patch/20230309104457.22…
--------------------------------
Fix a slab-out-of-bounds read that occurs in kmemdup() called from
brcmf_get_assoc_ies().
The bug could occur when assoc_info->req_len, data from a URB provided
by a USB device, is bigger than the size of buffer which is defined as
WL_EXTRA_BUF_MAX.
Add the size check for req_len/resp_len of assoc_info.
Found by a modified version of syzkaller.
[ 46.592467][ T7] ==================================================================
[ 46.594687][ T7] BUG: KASAN: slab-out-of-bounds in kmemdup+0x3e/0x50
[ 46.596572][ T7] Read of size 3014656 at addr ffff888019442000 by task kworker/0:1/7
[ 46.598575][ T7]
[ 46.599157][ T7] CPU: 0 PID: 7 Comm: kworker/0:1 Tainted: G O 5.14.0+ #145
[ 46.601333][ T7] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.12.1-0-ga5cab58e9a3f-prebuilt.qemu.org 04/01/2014
[ 46.604360][ T7] Workqueue: events brcmf_fweh_event_worker
[ 46.605943][ T7] Call Trace:
[ 46.606584][ T7] dump_stack_lvl+0x8e/0xd1
[ 46.607446][ T7] print_address_description.constprop.0.cold+0x93/0x334
[ 46.608610][ T7] ? kmemdup+0x3e/0x50
[ 46.609341][ T7] kasan_report.cold+0x79/0xd5
[ 46.610151][ T7] ? kmemdup+0x3e/0x50
[ 46.610796][ T7] kasan_check_range+0x14e/0x1b0
[ 46.611691][ T7] memcpy+0x20/0x60
[ 46.612323][ T7] kmemdup+0x3e/0x50
[ 46.612987][ T7] brcmf_get_assoc_ies+0x967/0xf60
[ 46.613904][ T7] ? brcmf_notify_vif_event+0x3d0/0x3d0
[ 46.614831][ T7] ? lock_chain_count+0x20/0x20
[ 46.615683][ T7] ? mark_lock.part.0+0xfc/0x2770
[ 46.616552][ T7] ? lock_chain_count+0x20/0x20
[ 46.617409][ T7] ? mark_lock.part.0+0xfc/0x2770
[ 46.618244][ T7] ? lock_chain_count+0x20/0x20
[ 46.619024][ T7] brcmf_bss_connect_done.constprop.0+0x241/0x2e0
[ 46.620019][ T7] ? brcmf_parse_configure_security.isra.0+0x2a0/0x2a0
[ 46.620818][ T7] ? __lock_acquire+0x181f/0x5790
[ 46.621462][ T7] brcmf_notify_connect_status+0x448/0x1950
[ 46.622134][ T7] ? rcu_read_lock_bh_held+0xb0/0xb0
[ 46.622736][ T7] ? brcmf_cfg80211_join_ibss+0x7b0/0x7b0
[ 46.623390][ T7] ? find_held_lock+0x2d/0x110
[ 46.623962][ T7] ? brcmf_fweh_event_worker+0x19f/0xc60
[ 46.624603][ T7] ? mark_held_locks+0x9f/0xe0
[ 46.625145][ T7] ? lockdep_hardirqs_on_prepare+0x3e0/0x3e0
[ 46.625871][ T7] ? brcmf_cfg80211_join_ibss+0x7b0/0x7b0
[ 46.626545][ T7] brcmf_fweh_call_event_handler.isra.0+0x90/0x100
[ 46.627338][ T7] brcmf_fweh_event_worker+0x557/0xc60
[ 46.627962][ T7] ? brcmf_fweh_call_event_handler.isra.0+0x100/0x100
[ 46.628736][ T7] ? rcu_read_lock_sched_held+0xa1/0xd0
[ 46.629396][ T7] ? rcu_read_lock_bh_held+0xb0/0xb0
[ 46.629970][ T7] ? lockdep_hardirqs_on_prepare+0x273/0x3e0
[ 46.630649][ T7] process_one_work+0x92b/0x1460
[ 46.631205][ T7] ? pwq_dec_nr_in_flight+0x330/0x330
[ 46.631821][ T7] ? rwlock_bug.part.0+0x90/0x90
[ 46.632347][ T7] worker_thread+0x95/0xe00
[ 46.632832][ T7] ? __kthread_parkme+0x115/0x1e0
[ 46.633393][ T7] ? process_one_work+0x1460/0x1460
[ 46.633957][ T7] kthread+0x3a1/0x480
[ 46.634369][ T7] ? set_kthread_struct+0x120/0x120
[ 46.634933][ T7] ret_from_fork+0x1f/0x30
[ 46.635431][ T7]
[ 46.635687][ T7] Allocated by task 7:
[ 46.636151][ T7] kasan_save_stack+0x1b/0x40
[ 46.636628][ T7] __kasan_kmalloc+0x7c/0x90
[ 46.637108][ T7] kmem_cache_alloc_trace+0x19e/0x330
[ 46.637696][ T7] brcmf_cfg80211_attach+0x4a0/0x4040
[ 46.638275][ T7] brcmf_attach+0x389/0xd40
[ 46.638739][ T7] brcmf_usb_probe+0x12de/0x1690
[ 46.639279][ T7] usb_probe_interface+0x2aa/0x760
[ 46.639820][ T7] really_probe+0x205/0xb70
[ 46.640342][ T7] __driver_probe_device+0x311/0x4b0
[ 46.640876][ T7] driver_probe_device+0x4e/0x150
[ 46.641445][ T7] __device_attach_driver+0x1cc/0x2a0
[ 46.642000][ T7] bus_for_each_drv+0x156/0x1d0
[ 46.642543][ T7] __device_attach+0x23f/0x3a0
[ 46.643065][ T7] bus_probe_device+0x1da/0x290
[ 46.643644][ T7] device_add+0xb7b/0x1eb0
[ 46.644130][ T7] usb_set_configuration+0xf59/0x16f0
[ 46.644720][ T7] usb_generic_driver_probe+0x82/0xa0
[ 46.645295][ T7] usb_probe_device+0xbb/0x250
[ 46.645786][ T7] really_probe+0x205/0xb70
[ 46.646258][ T7] __driver_probe_device+0x311/0x4b0
[ 46.646804][ T7] driver_probe_device+0x4e/0x150
[ 46.647387][ T7] __device_attach_driver+0x1cc/0x2a0
[ 46.647926][ T7] bus_for_each_drv+0x156/0x1d0
[ 46.648454][ T7] __device_attach+0x23f/0x3a0
[ 46.648939][ T7] bus_probe_device+0x1da/0x290
[ 46.649478][ T7] device_add+0xb7b/0x1eb0
[ 46.649936][ T7] usb_new_device.cold+0x49c/0x1029
[ 46.650526][ T7] hub_event+0x1c98/0x3950
[ 46.650975][ T7] process_one_work+0x92b/0x1460
[ 46.651535][ T7] worker_thread+0x95/0xe00
[ 46.651991][ T7] kthread+0x3a1/0x480
[ 46.652413][ T7] ret_from_fork+0x1f/0x30
[ 46.652885][ T7]
[ 46.653131][ T7] The buggy address belongs to the object at ffff888019442000
[ 46.653131][ T7] which belongs to the cache kmalloc-2k of size 2048
[ 46.654669][ T7] The buggy address is located 0 bytes inside of
[ 46.654669][ T7] 2048-byte region [ffff888019442000, ffff888019442800)
[ 46.656137][ T7] The buggy address belongs to the page:
[ 46.656720][ T7] page:ffffea0000651000 refcount:1 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x19440
[ 46.657792][ T7] head:ffffea0000651000 order:3 compound_mapcount:0 compound_pincount:0
[ 46.658673][ T7] flags: 0x100000000010200(slab|head|node=0|zone=1)
[ 46.659422][ T7] raw: 0100000000010200 0000000000000000 dead000000000122 ffff888100042000
[ 46.660363][ T7] raw: 0000000000000000 0000000000080008 00000001ffffffff 0000000000000000
[ 46.661236][ T7] page dumped because: kasan: bad access detected
[ 46.661956][ T7] page_owner tracks the page as allocated
[ 46.662588][ T7] page last allocated via order 3, migratetype Unmovable, gfp_mask 0x52a20(GFP_ATOMIC|__GFP_NOWARN|__GFP_NORETRY|__GFP_COMP), pid 7, ts 31136961085, free_ts 0
[ 46.664271][ T7] prep_new_page+0x1aa/0x240
[ 46.664763][ T7] get_page_from_freelist+0x159a/0x27c0
[ 46.665340][ T7] __alloc_pages+0x2da/0x6a0
[ 46.665847][ T7] alloc_pages+0xec/0x1e0
[ 46.666308][ T7] allocate_slab+0x380/0x4e0
[ 46.666770][ T7] ___slab_alloc+0x5bc/0x940
[ 46.667264][ T7] __slab_alloc+0x6d/0x80
[ 46.667712][ T7] kmem_cache_alloc_trace+0x30a/0x330
[ 46.668299][ T7] brcmf_usbdev_qinit.constprop.0+0x50/0x470
[ 46.668885][ T7] brcmf_usb_probe+0xc97/0x1690
[ 46.669438][ T7] usb_probe_interface+0x2aa/0x760
[ 46.669988][ T7] really_probe+0x205/0xb70
[ 46.670487][ T7] __driver_probe_device+0x311/0x4b0
[ 46.671031][ T7] driver_probe_device+0x4e/0x150
[ 46.671604][ T7] __device_attach_driver+0x1cc/0x2a0
[ 46.672192][ T7] bus_for_each_drv+0x156/0x1d0
[ 46.672739][ T7] page_owner free stack trace missing
[ 46.673335][ T7]
[ 46.673620][ T7] Memory state around the buggy address:
[ 46.674213][ T7] ffff888019442700: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
[ 46.675083][ T7] ffff888019442780: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
[ 46.675994][ T7] >ffff888019442800: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
[ 46.676875][ T7] ^
[ 46.677323][ T7] ffff888019442880: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
[ 46.678190][ T7] ffff888019442900: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
[ 46.679052][ T7] ==================================================================
[ 46.679945][ T7] Disabling lock debugging due to kernel taint
[ 46.680725][ T7] Kernel panic - not syncing:
Reviewed-by: Arend van Spriel <arend.vanspriel(a)broadcom.com>
Signed-off-by: Jisoo Jang <jisoo.jang(a)yonsei.ac.kr>
Signed-off-by: Kalle Valo <kvalo(a)kernel.org>
Link: https://lore.kernel.org/r/20230309104457.22628-1-jisoo.jang@yonsei.ac.kr
Signed-off-by: Baisong Zhong <zhongbaisong(a)huawei.com>
Reviewed-by: Liu Jian <liujian56(a)huawei.com>
Reviewed-by: Xiu Jianfeng <xiujianfeng(a)huawei.com>
Signed-off-by: Yongqiang Liu <liuyongqiang13(a)huawei.com>
---
drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c | 5 +++++
1 file changed, 5 insertions(+)
diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c
index bbdc6000afb9..4e1bd049dd06 100644
--- a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c
+++ b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c
@@ -5356,6 +5356,11 @@ static s32 brcmf_get_assoc_ies(struct brcmf_cfg80211_info *cfg,
(struct brcmf_cfg80211_assoc_ielen_le *)cfg->extra_buf;
req_len = le32_to_cpu(assoc_info->req_len);
resp_len = le32_to_cpu(assoc_info->resp_len);
+ if (req_len > WL_EXTRA_BUF_MAX || resp_len > WL_EXTRA_BUF_MAX) {
+ brcmf_err("invalid lengths in assoc info: req %u resp %u\n",
+ req_len, resp_len);
+ return -EINVAL;
+ }
if (req_len) {
err = brcmf_fil_iovar_data_get(ifp, "assoc_req_ies",
cfg->extra_buf,
--
2.25.1
1
0

[RFC PATCH openEuler-1.0-LTS v2] sched: memqos: add memqos for dynamic affinity
by Wang ShaoBo 23 Mar '23
by Wang ShaoBo 23 Mar '23
23 Mar '23
Add debug memband interface to dynamic affinity, this would be
useful for those threads sensitive to memory bandwidth.
Signed-off-by: Wang ShaoBo <bobo.shaobowang(a)huawei.com>
v2: Fix update thread's mpamid failed.
---
arch/arm64/include/asm/mpam.h | 2 +
arch/arm64/include/asm/mpam_sched.h | 2 +
arch/arm64/kernel/mpam/mpam_device.c | 58 ++-
arch/arm64/kernel/mpam/mpam_resctrl.c | 37 ++
arch/arm64/kernel/process.c | 2 +-
include/linux/memqos.h | 142 +++++++
include/linux/sched.h | 15 +-
include/linux/sysctl.h | 2 +
kernel/cgroup/cpuset.c | 1 +
kernel/exit.c | 3 +
kernel/fork.c | 4 +
kernel/sched/Makefile | 1 +
kernel/sched/core.c | 52 ++-
kernel/sched/fair.c | 14 +-
kernel/sched/memqos/Makefile | 6 +
kernel/sched/memqos/memqos.c | 297 +++++++++++++++
kernel/sched/memqos/phase_feature_sysctl.c | 183 +++++++++
kernel/sched/memqos/phase_memband.c | 179 +++++++++
kernel/sched/memqos/phase_perf.c | 412 +++++++++++++++++++++
kernel/sched/memqos/phase_sim_knn.c | 92 +++++
kernel/sysctl.c | 7 +
mm/mempolicy.c | 10 +-
22 files changed, 1500 insertions(+), 21 deletions(-)
create mode 100644 include/linux/memqos.h
create mode 100644 kernel/sched/memqos/Makefile
create mode 100644 kernel/sched/memqos/memqos.c
create mode 100644 kernel/sched/memqos/phase_feature_sysctl.c
create mode 100644 kernel/sched/memqos/phase_memband.c
create mode 100644 kernel/sched/memqos/phase_perf.c
create mode 100644 kernel/sched/memqos/phase_sim_knn.c
diff --git a/arch/arm64/include/asm/mpam.h b/arch/arm64/include/asm/mpam.h
index 6338eab817e75..269a91d8ca907 100644
--- a/arch/arm64/include/asm/mpam.h
+++ b/arch/arm64/include/asm/mpam.h
@@ -4,6 +4,8 @@
#ifdef CONFIG_MPAM
extern int mpam_rmid_to_partid_pmg(int rmid, int *partid, int *pmg);
+
+void mpam_component_config_mbwu_mon(int partid, int pmg, int monitor, int *result, int nr);
#endif
#endif /* _ASM_ARM64_MPAM_H */
diff --git a/arch/arm64/include/asm/mpam_sched.h b/arch/arm64/include/asm/mpam_sched.h
index 08ed349b6efa1..32d08cf654b31 100644
--- a/arch/arm64/include/asm/mpam_sched.h
+++ b/arch/arm64/include/asm/mpam_sched.h
@@ -40,6 +40,8 @@ static inline void mpam_sched_in(void)
__mpam_sched_in();
}
+void __mpam_sched_in_v2(struct task_struct *tsk);
+
#else
static inline void mpam_sched_in(void) {}
diff --git a/arch/arm64/kernel/mpam/mpam_device.c b/arch/arm64/kernel/mpam/mpam_device.c
index 6455c69f132fd..48de3982a0b9a 100644
--- a/arch/arm64/kernel/mpam/mpam_device.c
+++ b/arch/arm64/kernel/mpam/mpam_device.c
@@ -84,14 +84,14 @@ void mpam_class_list_lock_held(void)
static inline u32 mpam_read_reg(struct mpam_device *dev, u16 reg)
{
WARN_ON_ONCE(reg > SZ_MPAM_DEVICE);
- assert_spin_locked(&dev->lock);
+ //assert_spin_locked(&dev->lock);
/*
* If we touch a device that isn't accessible from this CPU we may get
* an external-abort.
*/
- WARN_ON_ONCE(preemptible());
- WARN_ON_ONCE(!cpumask_test_cpu(smp_processor_id(), &dev->fw_affinity));
+ //WARN_ON_ONCE(preemptible());
+ //WARN_ON_ONCE(!cpumask_test_cpu(smp_processor_id(), &dev->fw_affinity));
return readl_relaxed(dev->mapped_hwpage + reg);
}
@@ -99,14 +99,14 @@ static inline u32 mpam_read_reg(struct mpam_device *dev, u16 reg)
static inline void mpam_write_reg(struct mpam_device *dev, u16 reg, u32 val)
{
WARN_ON_ONCE(reg > SZ_MPAM_DEVICE);
- assert_spin_locked(&dev->lock);
+ //assert_spin_locked(&dev->lock);
/*
* If we touch a device that isn't accessible from this CPU we may get
* an external-abort. If we're lucky, we corrupt another mpam:component.
*/
- WARN_ON_ONCE(preemptible());
- WARN_ON_ONCE(!cpumask_test_cpu(smp_processor_id(), &dev->fw_affinity));
+ //WARN_ON_ONCE(preemptible());
+ //WARN_ON_ONCE(!cpumask_test_cpu(smp_processor_id(), &dev->fw_affinity));
writel_relaxed(val, dev->mapped_hwpage + reg);
}
@@ -1208,6 +1208,7 @@ static u32 mpam_device_read_mbwu_mon(struct mpam_device *dev,
{
u16 mon;
u32 clt, flt, cur_clt, cur_flt;
+ u32 total = 0;
mon = args->mon;
@@ -1249,7 +1250,12 @@ static u32 mpam_device_read_mbwu_mon(struct mpam_device *dev,
wmb();
}
- return mpam_read_reg(dev, MSMON_MBWU);
+ total += mpam_read_reg(dev, MSMON_MBWU);
+ total += mpam_read_reg(dev, MSMON_MBWU);
+ total += mpam_read_reg(dev, MSMON_MBWU);
+ total += mpam_read_reg(dev, MSMON_MBWU);
+ total += mpam_read_reg(dev, MSMON_MBWU);
+ return total / 5;
}
static int mpam_device_frob_mon(struct mpam_device *dev,
@@ -1470,6 +1476,44 @@ static void mpam_component_device_sync(void *__ctx)
cpumask_set_cpu(smp_processor_id(), &ctx->updated_on);
}
+static DEFINE_SPINLOCK(mpam_tmp_lock);
+
+void mpam_component_config_mbwu_mon(int partid, int pmg, int monitor, int *result, int nr)
+{
+ struct mpam_class *class;
+ struct mpam_component *comp;
+ struct mpam_device *dev;
+ struct sync_args args;
+ int i = 0;
+
+ args.pmg = pmg;
+ args.mon = monitor;
+ args.closid.reqpartid = partid;
+ args.match_pmg = 1;
+
+ spin_lock(&mpam_tmp_lock);
+ list_for_each_entry(class, &mpam_classes, classes_list) {
+ if (class->type != MPAM_CLASS_MEMORY)
+ continue;
+
+ list_for_each_entry(comp, &class->components, class_list) {
+ if (i >= nr) {
+ pr_err_once("error, i > result nr");
+ break;
+ }
+ result[i] = 0;
+ list_for_each_entry(dev, &comp->devices, comp_list) {
+ result[i] += mpam_device_read_mbwu_mon(dev, &args);
+ }
+ i++;
+ }
+ break;
+ }
+ spin_unlock(&mpam_tmp_lock);
+
+}
+EXPORT_SYMBOL(mpam_component_config_mbwu_mon);
+
/**
* in some cases/platforms the MSC register access is only possible with
* the associated CPUs. And need to check if those CPUS are online before
diff --git a/arch/arm64/kernel/mpam/mpam_resctrl.c b/arch/arm64/kernel/mpam/mpam_resctrl.c
index 60d3d8706a38b..26258f7508ac4 100644
--- a/arch/arm64/kernel/mpam/mpam_resctrl.c
+++ b/arch/arm64/kernel/mpam/mpam_resctrl.c
@@ -2226,6 +2226,43 @@ int mpam_resctrl_init(void)
return resctrl_group_init();
}
+
+void __mpam_sched_in_v2(struct task_struct *tsk)
+{
+ struct intel_pqr_state *state = this_cpu_ptr(&pqr_state);
+ u64 rmid = state->default_rmid;
+ u64 closid = state->default_closid;
+
+ /*
+ * If this task has a closid/rmid assigned, use it.
+ * Else use the closid/rmid assigned to this cpu.
+ */
+ if (tsk->closid)
+ closid = tsk->closid;
+
+ if (tsk->rmid)
+ rmid = tsk->rmid;
+
+ if (closid != state->cur_closid || rmid != state->cur_rmid) {
+ u64 reg;
+
+ /* set in EL0 */
+ reg = mpam_read_sysreg_s(SYS_MPAM0_EL1, "SYS_MPAM0_EL1");
+ reg = PARTID_SET(reg, closid);
+ reg = PMG_SET(reg, rmid);
+ mpam_write_sysreg_s(reg, SYS_MPAM0_EL1, "SYS_MPAM0_EL1");
+
+ /* set in EL1 */
+ reg = mpam_read_sysreg_s(SYS_MPAM1_EL1, "SYS_MPAM1_EL1");
+ reg = PARTID_SET(reg, closid);
+ reg = PMG_SET(reg, rmid);
+ mpam_write_sysreg_s(reg, SYS_MPAM1_EL1, "SYS_MPAM1_EL1");
+
+ state->cur_rmid = rmid;
+ state->cur_closid = closid;
+ }
+}
+
/*
* __intel_rdt_sched_in() - Writes the task's CLOSid/RMID to IA32_PQR_MSR
*
diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c
index e5be78915632c..7896bb74ecc49 100644
--- a/arch/arm64/kernel/process.c
+++ b/arch/arm64/kernel/process.c
@@ -531,7 +531,7 @@ __notrace_funcgraph struct task_struct *__switch_to(struct task_struct *prev,
/* the actual thread switch */
last = cpu_switch_to(prev, next);
- mpam_sched_in();
+ //mpam_sched_in();
return last;
}
diff --git a/include/linux/memqos.h b/include/linux/memqos.h
new file mode 100644
index 0000000000000..814e9935590d3
--- /dev/null
+++ b/include/linux/memqos.h
@@ -0,0 +1,142 @@
+#ifndef _MEMQOS_H
+#define _MEMQOS_H
+
+#include <linux/vmstat.h>
+#include <linux/rbtree.h>
+//#include <linux/sched.h>
+
+struct task_struct;
+
+struct memqos_domain {
+ int dom_id;
+ int total_memband_div_10;
+ int total_out_memband_div_10;
+
+ //record 10 timers
+ int memband_ringpos;
+ int memband_div_10_history[4][10];
+};
+
+struct memqos_mpam_profile {
+ int partid;
+ int pmg;
+ int monitor;
+
+ struct task_struct *tsk;
+ int used;
+};
+
+struct memqos_wait_profile {
+ struct memqos_mpam_profile *profile;
+ struct list_head wait_list;
+};
+
+struct memqos_class {
+ struct list_head turbo_list;
+ struct list_head tasks_list;
+};
+
+#include <linux/topology.h>
+//embed in task_struct
+
+struct task_memqos {
+ int ipc_ringpos;
+ int ipcx10;
+ int ipcx10_total[4];
+ int ipcx10_history[10];
+
+ int memband_div_10;
+ int memband_ringpos;
+ int memband_div_10_total[4];
+ int memband_div_10_history[4][10];
+
+ u32 sample_times;
+ int account_ready;
+ int numa_score[4];
+ int turbo;
+
+ struct memqos_wait_profile mpam_profile;
+
+ struct list_head turbo_list;
+ struct list_head task_list;
+
+ struct cpumask *advise_mem_node_mask;
+ int preferred_nid;
+
+ int class_id;
+
+ int corrupt;
+};
+
+#define PHASE_PEVENT_NUM 10
+
+struct phase_event_pcount {
+ u64 data[PHASE_PEVENT_NUM];
+};
+
+struct phase_event_count {
+ struct phase_event_pcount pcount;
+};
+
+void phase_update_mpam_label(struct task_struct *tsk);
+
+void phase_release_mpam_label(struct task_struct *tsk);
+
+static inline void memqos_update_mpam_label(struct task_struct *tsk)
+{
+ phase_update_mpam_label(tsk);
+}
+
+static inline void memqos_release_mpam_label(struct task_struct *tsk)
+{
+ phase_release_mpam_label(tsk);
+}
+
+void phase_destroy_waitqueue(struct task_struct *tsk);
+
+void phase_get_memband(struct memqos_mpam_profile *pm, int *result, int nr);
+
+DECLARE_STATIC_KEY_FALSE(sched_phase);
+DECLARE_STATIC_KEY_FALSE(sched_phase_printk);
+
+int phase_perf_create(void);
+
+void phase_perf_release(void);
+
+void memqos_account_task(struct task_struct *p, int cpu);
+
+void memqos_drop_class(struct task_struct *p);
+
+void phase_account_task(struct task_struct *p, int cpu);
+
+static inline void memqos_task_collect_data(struct task_struct *p, int cpu)
+{
+ phase_account_task(p, cpu);
+}
+
+static inline void memqos_task_account(struct task_struct *p, int cpu)
+{
+ memqos_account_task(p, cpu);
+}
+
+static inline void memqos_task_exit(struct task_struct *p)
+{
+
+ memqos_drop_class(p);
+ phase_destroy_waitqueue(p);
+}
+
+void memqos_select_nicest_cpus(struct task_struct *p);
+
+void memqos_exclude_low_level_task_single(struct task_struct *p);
+
+int knn_get_tag(int ipcx10, int memband_div_10);
+
+void memqos_init_class(struct task_struct *p);
+
+void phase_trace_printk(struct task_struct *p);
+static inline void memqos_trace_printk(struct task_struct *p)
+{
+ phase_trace_printk(p);
+}
+#endif
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 928186f161000..c5b74cd0c5830 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -29,6 +29,7 @@
#include <linux/task_io_accounting.h>
#include <linux/rseq.h>
#include <linux/thread_bits.h>
+#include <linux/memqos.h>
/* task_struct member predeclarations (sorted alphabetically): */
struct audit_context;
@@ -1268,7 +1269,7 @@ struct task_struct {
#if !defined(__GENKSYMS__)
#if defined(CONFIG_QOS_SCHED_DYNAMIC_AFFINITY)
cpumask_t *prefer_cpus;
- const cpumask_t *select_cpus;
+ cpumask_t *select_cpus;
#else
KABI_RESERVE(6)
KABI_RESERVE(7)
@@ -1279,6 +1280,10 @@ struct task_struct {
#endif
KABI_RESERVE(8)
+#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
+ struct task_memqos sched_memqos;
+#endif
+
/* CPU-specific state of this task: */
struct thread_struct thread;
@@ -1998,6 +2003,14 @@ int set_prefer_cpus_ptr(struct task_struct *p,
const struct cpumask *new_mask);
int sched_prefer_cpus_fork(struct task_struct *p, struct task_struct *orig);
void sched_prefer_cpus_free(struct task_struct *p);
+static inline bool prefer_cpus_valid(struct task_struct *p)
+{
+ return p->prefer_cpus &&
+ !cpumask_empty(p->prefer_cpus) &&
+ !cpumask_equal(p->prefer_cpus, &p->cpus_allowed) &&
+ cpumask_subset(p->prefer_cpus, &p->cpus_allowed);
+}
+void sched_memqos_task_collect_data_range(int start_cpu, int end_cpu);
#endif
#endif
diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
index b769ecfcc3bd4..73bce39107cb3 100644
--- a/include/linux/sysctl.h
+++ b/include/linux/sysctl.h
@@ -230,6 +230,8 @@ static inline void setup_sysctl_set(struct ctl_table_set *p,
#endif /* CONFIG_SYSCTL */
+extern struct ctl_table phase_table[];
+
int sysctl_max_threads(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp, loff_t *ppos);
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 55bfbc4cdb16c..d94a9065a5605 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -106,6 +106,7 @@ struct cpuset {
nodemask_t mems_allowed;
#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
cpumask_var_t prefer_cpus;
+ int mem_turbo;
#endif
/* effective CPUs and Memory Nodes allow to tasks */
diff --git a/kernel/exit.c b/kernel/exit.c
index 2a32d32bdc03d..b731c19618176 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -699,6 +699,8 @@ static void check_stack_usage(void)
static inline void check_stack_usage(void) {}
#endif
+#include <linux/memqos.h>
+
void __noreturn do_exit(long code)
{
struct task_struct *tsk = current;
@@ -806,6 +808,7 @@ void __noreturn do_exit(long code)
* because of cgroup mode, must be called before cgroup_exit()
*/
perf_event_exit_task(tsk);
+ memqos_task_exit(tsk);
sched_autogroup_exit_task(tsk);
cgroup_exit(tsk);
diff --git a/kernel/fork.c b/kernel/fork.c
index b5453a26655e2..0a762b92dc814 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -841,6 +841,8 @@ void set_task_stack_end_magic(struct task_struct *tsk)
*stackend = STACK_END_MAGIC; /* for overflow detection */
}
+
+#include <linux/memqos.h>
static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
{
struct task_struct *tsk;
@@ -923,6 +925,8 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
kcov_task_init(tsk);
+ memqos_init_class(tsk);
+
#ifdef CONFIG_FAULT_INJECTION
tsk->fail_nth = 0;
#endif
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index 7fe183404c383..471380d6686e3 100644
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -29,3 +29,4 @@ obj-$(CONFIG_CPU_FREQ) += cpufreq.o
obj-$(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) += cpufreq_schedutil.o
obj-$(CONFIG_MEMBARRIER) += membarrier.o
obj-$(CONFIG_CPU_ISOLATION) += isolation.o
+obj-$(CONFIG_QOS_SCHED_DYNAMIC_AFFINITY) += memqos/
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 970616070da86..15c7e1e3408cb 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2787,6 +2787,8 @@ asmlinkage __visible void schedule_tail(struct task_struct *prev)
calculate_sigpending();
}
+#include <linux/memqos.h>
+
/*
* context_switch - switch to the new MM and the new thread's register state.
*/
@@ -2794,6 +2796,8 @@ static __always_inline struct rq *
context_switch(struct rq *rq, struct task_struct *prev,
struct task_struct *next, struct rq_flags *rf)
{
+ struct rq *ret;
+
prepare_task_switch(rq, prev, next);
/*
@@ -2837,6 +2841,18 @@ context_switch(struct rq *rq, struct task_struct *prev,
}
}
+ //account and release
+ memqos_task_account(prev, smp_processor_id());
+
+ if (prefer_cpus_valid(prev))
+ memqos_trace_printk(prev);
+
+ memqos_release_mpam_label(prev);
+
+ //label new task's mpamid
+ if (prefer_cpus_valid(next))
+ memqos_update_mpam_label(next);
+
rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP);
prepare_lock_switch(rq, next, rf);
@@ -2845,7 +2861,9 @@ context_switch(struct rq *rq, struct task_struct *prev,
switch_to(prev, next, prev);
barrier();
- return finish_task_switch(prev);
+ ret = finish_task_switch(prev);
+
+ return ret;
}
/*
@@ -3051,6 +3069,20 @@ unsigned long long task_sched_runtime(struct task_struct *p)
return ns;
}
+void sched_memqos_task_collect_data_range(int start_cpu, int end_cpu)
+{
+ int cpu;
+ struct task_struct *curr;
+ struct rq *rq_curr;
+
+ for (cpu = start_cpu; cpu <= end_cpu; cpu++) {
+ rq_curr = cpu_rq(cpu);
+ curr = rq_curr->curr;
+ if (curr && prefer_cpus_valid(curr))
+ memqos_task_collect_data(curr, cpu);
+ }
+}
+
/*
* This function gets called by the timer code, with HZ frequency.
* We call it with interrupts disabled.
@@ -3058,8 +3090,12 @@ unsigned long long task_sched_runtime(struct task_struct *p)
void scheduler_tick(void)
{
int cpu = smp_processor_id();
+ //memqos clooect next cpu's memband and perf
+ //int cpu_memqos = (cpu + 1) % nr_cpu_ids;
struct rq *rq = cpu_rq(cpu);
+ //struct rq *rq_next = cpu_rq(cpu_memqos);
struct task_struct *curr = rq->curr;
+ //struct task_struct *curr_memqos = rq_next->curr;
struct rq_flags rf;
sched_clock_tick();
@@ -3075,6 +3111,10 @@ void scheduler_tick(void)
perf_event_task_tick();
+ //only monitor task enabled dynamic affinity
+ //if (curr_memqos && prefer_cpus_valid(curr_memqos))
+ // memqos_task_collect_data(curr_memqos, cpu_memqos);
+
#ifdef CONFIG_SMP
rq->idle_balance = idle_cpu(cpu);
trigger_load_balance(rq);
@@ -3524,6 +3564,16 @@ static void __sched notrace __schedule(bool preempt)
/* Also unlocks the rq: */
rq = context_switch(rq, prev, next, &rf);
} else {
+ memqos_task_account(prev, smp_processor_id());
+
+ if (prefer_cpus_valid(prev))
+ memqos_trace_printk(prev);
+
+ memqos_release_mpam_label(prev);
+ //relabel this task's mpamid
+ if (prefer_cpus_valid(prev))
+ memqos_update_mpam_label(prev);
+
rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP);
rq_unlock_irq(rq, &rf);
}
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index af55a26d11fcb..12e9675495d2c 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -6675,6 +6675,7 @@ static int wake_cap(struct task_struct *p, int cpu, int prev_cpu)
}
#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
+#include <linux/memqos.h>
/*
* Low utilization threshold for CPU
*
@@ -6749,14 +6750,6 @@ static inline int cpu_vutil_of(int cpu)
return cputime->vutil;
}
-static inline bool prefer_cpus_valid(struct task_struct *p)
-{
- return p->prefer_cpus &&
- !cpumask_empty(p->prefer_cpus) &&
- !cpumask_equal(p->prefer_cpus, &p->cpus_allowed) &&
- cpumask_subset(p->prefer_cpus, &p->cpus_allowed);
-}
-
/*
* set_task_select_cpus: select the cpu range for task
* @p: the task whose available cpu range will to set
@@ -6828,8 +6821,13 @@ static void set_task_select_cpus(struct task_struct *p, int *idlest_cpu,
if (util_avg_sum < sysctl_sched_util_low_pct *
cpumask_weight(p->prefer_cpus)) {
p->select_cpus = p->prefer_cpus;
+ memqos_select_nicest_cpus(p);
if (sd_flag & SD_BALANCE_WAKE)
schedstat_inc(p->se.dyn_affi_stats->nr_wakeups_preferred_cpus);
+ } else {
+ //select trubo task
+ //select low class task
+ memqos_exclude_low_level_task_single(p);
}
}
#endif
diff --git a/kernel/sched/memqos/Makefile b/kernel/sched/memqos/Makefile
new file mode 100644
index 0000000000000..ed8f42649a8a7
--- /dev/null
+++ b/kernel/sched/memqos/Makefile
@@ -0,0 +1,6 @@
+# SPDX-License-Identifier: GPL-2.0
+# These files are disabled because they produce non-interesting flaky coverage
+# that is not a function of syscall inputs. E.g. involuntary context switches.
+KCOV_INSTRUMENT := n
+
+obj-y := memqos.o phase_feature_sysctl.o phase_memband.o phase_perf.o phase_sim_knn.o
diff --git a/kernel/sched/memqos/memqos.c b/kernel/sched/memqos/memqos.c
new file mode 100644
index 0000000000000..29fc6af1f02c1
--- /dev/null
+++ b/kernel/sched/memqos/memqos.c
@@ -0,0 +1,297 @@
+#include <linux/memqos.h>
+#include <linux/cpumask.h>
+#include <linux/sched.h>
+
+static void memqos_set_task_classid(struct task_struct *p)
+{
+ int class_id;
+ int memband_div_10 = p->sched_memqos.memband_div_10;
+ int ipcx10 = p->sched_memqos.ipcx10;
+
+ class_id = knn_get_tag((u64)ipcx10, (u64)memband_div_10);
+ p->sched_memqos.class_id = class_id;
+}
+
+//static memqos_domain mq_domains[] = {
+// {.dom_id = 0, .total_memband = 0, .total_out_memband = 0,},
+// {.dom_id = 1, .total_memband = 0, .total_out_memband = 0,},
+// {.dom_id = 2, .total_memband = 0, .total_out_memband = 0,},
+// {.dom_id = 3, .total_memband = 0, .total_out_memband = 0,},
+//};
+
+static DEFINE_PER_CPU(struct memqos_class, memqos_classes[8]);
+//static DEFINE_PER_CPU(spinlock_t, memqos_class_lock);
+static DEFINE_SPINLOCK(memqos_class_lock);
+
+static int memqos_class_online(unsigned int cpu)
+{
+ int class_id = 0;
+ struct memqos_class *class;
+
+ for (class_id = 0; class_id < 8; class_id++) {
+ class = &per_cpu(memqos_classes, cpu)[class_id];
+ INIT_LIST_HEAD(&class->tasks_list);
+ INIT_LIST_HEAD(&class->turbo_list);
+ }
+ return 0;
+}
+
+static int memqos_class_offline(unsigned int cpu)
+{
+ return 0;
+}
+
+#include <linux/cpu.h>
+#include <linux/cacheinfo.h>
+
+static void memqos_init(void)
+{
+ int cpuhp_state = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN,
+ "memqos:online", memqos_class_online,
+ memqos_class_offline);
+ if (cpuhp_state <= 0) {
+ pr_err("Failed to register 'dyn' cpuhp callbacks");
+ return;
+ }
+}
+late_initcall(memqos_init);
+
+static void memqos_insert_to_class(struct task_struct *p, int cpu)
+{
+ unsigned long flag;
+ int class_id = p->sched_memqos.class_id;
+ struct memqos_class *class;
+ struct task_memqos *memqos;
+
+ if (class_id >= 8)
+ return;
+
+ memqos = &p->sched_memqos;
+
+ class = &per_cpu(memqos_classes, cpu)[class_id];
+
+ spin_lock_irqsave(&memqos_class_lock, flag);
+ if (p->sched_memqos.corrupt) {
+ spin_unlock_irqrestore(&memqos_class_lock, flag);
+ return;
+ }
+
+ list_move_tail(&p->sched_memqos.task_list, &class->tasks_list);
+ if (memqos->turbo)
+ list_move_tail(&p->sched_memqos.turbo_list, &class->turbo_list);
+ spin_unlock_irqrestore(&memqos_class_lock, flag);
+}
+
+static void memqos_drop_class_without_lock(struct task_struct *p)
+{
+ list_del_init(&p->sched_memqos.task_list);
+ list_del_init(&p->sched_memqos.turbo_list);
+}
+
+static void memqos_score(struct task_struct *p)
+{
+ int total_n1 = p->sched_memqos.memband_div_10_total[0];
+ int total_n2 = p->sched_memqos.memband_div_10_total[1];
+ int total_n3 = p->sched_memqos.memband_div_10_total[2];
+ int total_n4 = p->sched_memqos.memband_div_10_total[3];
+
+ p->sched_memqos.numa_score[0] = (total_n1 - (total_n2 + total_n3 + total_n4)) * 10 / total_n1;
+ p->sched_memqos.numa_score[1] = (total_n2 - (total_n1 + total_n3 + total_n4)) * 10 / total_n2;
+ p->sched_memqos.numa_score[2] = (total_n3 - (total_n1 + total_n2 + total_n4)) * 10 / total_n3;
+ p->sched_memqos.numa_score[3] = (total_n4 - (total_n1 + total_n2 + total_n3)) * 10 / total_n4;
+
+ //over x% percent
+ if (p->sched_memqos.numa_score[0] > 0)
+ p->sched_memqos.turbo = 1;
+ else if (p->sched_memqos.numa_score[1] > 0)
+ p->sched_memqos.turbo = 2;
+ else if (p->sched_memqos.numa_score[2] > 0)
+ p->sched_memqos.turbo = 3;
+ else if (p->sched_memqos.numa_score[3] > 0)
+ p->sched_memqos.turbo = 4;
+ else
+ p->sched_memqos.turbo = 0;
+}
+
+void memqos_account_task(struct task_struct *p, int cpu)
+{
+ if (!p->sched_memqos.account_ready ||
+ p->sched_memqos.corrupt)
+ return;
+ memqos_set_task_classid(p);
+ memqos_insert_to_class(p, cpu);
+ memqos_score(p);
+ p->sched_memqos.account_ready = 0;
+}
+
+void memqos_init_class(struct task_struct *p)
+{
+ memset(&p->sched_memqos, 0, sizeof(struct task_memqos));
+ spin_lock(&memqos_class_lock);
+ INIT_LIST_HEAD(&p->sched_memqos.task_list);
+ INIT_LIST_HEAD(&p->sched_memqos.turbo_list);
+ INIT_LIST_HEAD(&p->sched_memqos.mpam_profile.wait_list);
+ spin_unlock(&memqos_class_lock);
+
+ p->closid = 0;
+ p->rmid = 0;
+}
+
+//destroy ?
+void memqos_drop_class(struct task_struct *p)
+{
+ spin_lock(&memqos_class_lock);
+ memqos_drop_class_without_lock(p);
+ p->sched_memqos.corrupt = 1;
+ spin_unlock(&memqos_class_lock);
+}
+
+void memqos_select_nicest_cpus(struct task_struct *p)
+{
+ int i = 0;
+ int max_score = -10000;
+ int select_node = 0;
+ struct task_memqos *memqos = &p->sched_memqos;
+
+ if (!memqos->turbo) {
+ for (i = 0; i < 4; i++) {
+ if (!cpumask_intersects(cpumask_of_node(i), p->select_cpus))
+ continue;
+
+ if (memqos->numa_score[i] > max_score) {
+ select_node = i;
+ max_score = memqos->numa_score[i];
+ }
+ }
+
+ cpumask_and(p->select_cpus, p->select_cpus, cpumask_of_node(select_node));
+ //p->sched_memqos.advise_mem_node_mask = cpumask_of_node(select_node);
+ p->sched_memqos.preferred_nid = memqos->turbo;
+ return;
+ }
+
+ select_node = memqos->turbo - 1;
+ if (cpumask_intersects(cpumask_of_node(select_node), p->select_cpus)) {
+ cpumask_and(p->select_cpus, p->select_cpus, cpumask_of_node(select_node));
+ //p->sched_memqos.advise_mem_node_mask = cpumask_of_node(select_node);
+ p->sched_memqos.preferred_nid = memqos->turbo;
+ }
+
+ //if turbo another cpus, wait...
+ return;
+}
+
+void memqos_exclude_low_level_task_single(struct task_struct *p)
+{
+ int i, j, cpu;
+ int find = 0;
+ int select_node = 0;
+ const struct cpumask *cpumask;
+ struct cpumask cpumask_med;
+ struct memqos_class *class;
+ struct task_memqos *memqos = &p->sched_memqos;;
+ struct task_struct *tsk = NULL;
+ int max_score = -100000;
+
+ if (memqos->turbo) {
+ select_node = memqos->turbo - 1;
+ cpumask = cpumask_of_node(select_node);
+ if (!cpumask_intersects(cpumask, p->prefer_cpus) &&
+ (cpumask_intersects(&p->cpus_allowed, cpumask))) {
+ cpumask_and(p->select_cpus, &p->cpus_allowed, cpumask);
+ //go out!
+ spin_lock(&memqos_class_lock);
+ memqos_drop_class_without_lock(p);
+ spin_unlock(&memqos_class_lock);
+ //p->sched_memqos.advise_mem_node_mask = cpumask_of_node(select_node);
+ p->sched_memqos.preferred_nid = memqos->turbo;
+ return;
+ } else if (cpumask_intersects(p->prefer_cpus, cpumask)) {
+ cpumask_and(p->select_cpus, p->prefer_cpus, cpumask);
+ //p->sched_memqos.advise_mem_node_mask = cpumask_of_node(select_node);
+ p->sched_memqos.preferred_nid = memqos->turbo;
+ }
+ }
+
+ //select turbo one
+ for (cpu = 0; cpu < nr_cpu_ids; cpu++) {
+ if (!cpumask_test_cpu(cpu, p->prefer_cpus))
+ continue;
+
+ spin_lock(&memqos_class_lock);
+ for (i = 7; i >= 0; i--) {
+ class = &per_cpu(memqos_classes, cpu)[i];
+ list_for_each_entry(memqos, &class->turbo_list, turbo_list) {
+ if (!memqos->turbo)
+ continue;
+ select_node = memqos->turbo - 1;
+ cpumask = cpumask_of_node(select_node);
+ if (!cpumask_intersects(cpumask, p->prefer_cpus)) {
+ tsk = container_of(memqos, struct task_struct, sched_memqos);
+ if (!cpumask_intersects(cpumask, &tsk->cpus_allowed))
+ continue;
+ cpumask_and(tsk->select_cpus, &tsk->cpus_allowed, cpumask);
+ //mem prefered
+ //tsk->sched_memqos.advise_mem_node_mask = cpumask_of_node(select_node);
+ tsk->sched_memqos.preferred_nid = memqos->turbo;
+ find = 1;
+ break;
+ }
+ }
+ if (find) {
+ memqos_drop_class_without_lock(tsk);
+ spin_unlock(&memqos_class_lock);
+ return;
+ }
+ }
+ spin_unlock(&memqos_class_lock);
+ }
+
+ find = 0;
+
+ //if not, select lower class's tsk
+ for (cpu = 0; cpu < nr_cpu_ids; cpu++) {
+ if (!cpumask_test_cpu(cpu, p->prefer_cpus))
+ continue;
+
+ spin_lock(&memqos_class_lock);
+ //only find below class tsk
+ for (i = 0; i < memqos->class_id; i++) {
+ class = &per_cpu(memqos_classes, cpu)[i];
+ list_for_each_entry(memqos, &class->tasks_list, task_list) {
+ if (memqos->turbo)
+ continue;
+
+ tsk = container_of(memqos, struct task_struct, sched_memqos);
+ for (j = 0; j < 4; j++) {
+ if (!cpumask_intersects(cpumask_of_node(i), &tsk->cpus_allowed))
+ continue;
+ if (memqos->numa_score[j] > max_score) {
+ select_node = j;
+ max_score = memqos->numa_score[j];
+ }
+ find = 1;
+ }
+ if (!find)
+ continue;
+
+ cpumask_and(&cpumask_med, cpumask_of_node(select_node), &tsk->cpus_allowed);
+ cpumask_andnot(&cpumask_med, &cpumask_med, p->prefer_cpus);
+ if (cpumask_empty(&cpumask_med))
+ continue;
+ cpumask_copy(tsk->select_cpus, &cpumask_med);
+ //mem prefered
+ //tsk->sched_memqos.advise_mem_node_mask = cpumask_of_node(select_node);
+ tsk->sched_memqos.preferred_nid = memqos->turbo;
+ memqos_drop_class_without_lock(tsk);
+ spin_unlock(&memqos_class_lock);
+ return;
+ }
+ }
+ spin_unlock(&memqos_class_lock);
+ }
+
+ //do not care, this task may out
+ return;
+}
+
diff --git a/kernel/sched/memqos/phase_feature_sysctl.c b/kernel/sched/memqos/phase_feature_sysctl.c
new file mode 100644
index 0000000000000..9106a90868a3d
--- /dev/null
+++ b/kernel/sched/memqos/phase_feature_sysctl.c
@@ -0,0 +1,183 @@
+#include <linux/sched.h>
+#include <linux/sysctl.h>
+#include <linux/capability.h>
+#include <linux/cpumask.h>
+#include <linux/topology.h>
+#include <linux/sched/task.h>
+
+#include <linux/memqos.h>
+
+#ifdef CONFIG_PROC_SYSCTL
+
+//setup timer for counting
+#include <linux/sched.h>
+#include <linux/timer.h>
+#include <asm/ioctl.h>
+
+//at least 2 cpu
+static enum hrtimer_restart timer_fn_twin_a(struct hrtimer *timer_data)
+{
+ sched_memqos_task_collect_data_range(0, nr_cpu_ids / 2 - 1);
+ hrtimer_forward_now(timer_data, 1 * NSEC_PER_MSEC);
+ return HRTIMER_RESTART;
+}
+
+static enum hrtimer_restart timer_fn_twin_b(struct hrtimer *timer_data)
+{
+ sched_memqos_task_collect_data_range(nr_cpu_ids / 2, nr_cpu_ids - 1);
+ hrtimer_forward_now(timer_data, 1 * NSEC_PER_MSEC);
+ return HRTIMER_RESTART;
+}
+
+static struct hrtimer timer_twin_a;
+static struct hrtimer timer_twin_b;
+
+static void memqos_timer_init_func_a(void *info) {
+ hrtimer_init(&timer_twin_a, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
+ timer_twin_a.function = timer_fn_twin_a;
+ hrtimer_start(&timer_twin_a, ktime_add_ns(ktime_get(), 10000000), HRTIMER_MODE_ABS);
+}
+
+static void memqos_timer_init_func_b(void *info) {
+ hrtimer_init(&timer_twin_b, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
+ timer_twin_b.function = timer_fn_twin_b;
+ hrtimer_start(&timer_twin_b, ktime_add_ns(ktime_get(), 10000000), HRTIMER_MODE_ABS);
+}
+
+static void memqos_timer_init_a(void)
+{
+ smp_call_function_single(0, memqos_timer_init_func_b, NULL, 0);
+}
+
+static void memqos_timer_init_b(void)
+{
+ smp_call_function_single(nr_cpu_ids / 2, memqos_timer_init_func_a, NULL, 0);
+}
+
+static void memqos_timer_twin_init(void) {
+ memqos_timer_init_a();
+ memqos_timer_init_b();
+}
+
+static void memqos_timer_twin_exit(void) {
+ hrtimer_cancel(&timer_twin_a);
+ hrtimer_cancel(&timer_twin_b);
+}
+
+DEFINE_STATIC_KEY_FALSE(sched_phase);
+DEFINE_STATIC_KEY_FALSE(sched_phase_printk);
+
+static int set_phase_state(bool enabled)
+{
+ int err;
+ int state = static_branch_likely(&sched_phase);
+
+ if (enabled == state) {
+ pr_warn("phase has already %s\n", state ? "enabled" : "disabled");
+ return 0;
+ }
+
+ if (enabled) {
+ err = phase_perf_create();
+ if (err) {
+ pr_err("phase enable failed\n");
+ return err;
+ }
+ static_branch_enable(&sched_phase);
+ pr_info("phase enabled\n");
+ memqos_timer_twin_init();
+ } else {
+ static_branch_disable(&sched_phase);
+ phase_perf_release();
+ pr_info("phase disabled\n");
+ memqos_timer_twin_exit();
+ }
+
+ return 0;
+}
+
+/*
+ * the other procfs files of phase cannot be modified if sched_phase is already enabled
+ */
+static int phase_proc_state(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ struct ctl_table t;
+ int err;
+ int state = static_branch_likely(&sched_phase);
+
+ if (write && !capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ t = *table;
+ t.data = &state;
+ err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos);
+ if (err < 0)
+ return err;
+ if (write)
+ err = set_phase_state(state);
+
+ return err;
+}
+
+static int set_phase_state_printk(bool enabled)
+{
+ if (enabled) {
+ static_branch_enable(&sched_phase_printk);
+ } else {
+ static_branch_disable(&sched_phase_printk);
+ }
+
+ return 0;
+}
+
+/*
+ * the other procfs files of phase cannot be modified if sched_phase is already enabled
+ */
+static int phase_proc_state_printk(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ struct ctl_table t;
+ int err;
+ int state = static_branch_likely(&sched_phase);
+
+ if (write && !capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ t = *table;
+ t.data = &state;
+ err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos);
+ if (err < 0)
+ return err;
+ if (write)
+ err = set_phase_state_printk(state);
+
+ return err;
+}
+
+
+static int __maybe_unused zero;
+static int __maybe_unused one = 1;
+
+struct ctl_table phase_table[] = {
+ {
+ .procname = "enabled",
+ .data = NULL,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = phase_proc_state,
+ .extra1 = &zero,
+ .extra2 = &one,
+ },
+ {
+ .procname = "trace_enabled",
+ .data = NULL,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = phase_proc_state_printk,
+ .extra1 = &zero,
+ .extra2 = &one,
+ },
+ { }
+};
+#endif /* CONFIG_PROC_SYSCTL */
diff --git a/kernel/sched/memqos/phase_memband.c b/kernel/sched/memqos/phase_memband.c
new file mode 100644
index 0000000000000..df8b2811f6ab7
--- /dev/null
+++ b/kernel/sched/memqos/phase_memband.c
@@ -0,0 +1,179 @@
+#include <linux/types.h>
+#include <linux/cpu.h>
+#include <linux/memqos.h>
+
+#include <asm/cpu.h>
+#include <asm/cputype.h>
+#include <asm/cpufeature.h>
+#include <asm/mpam_sched.h>
+
+static const int nr_partid = 15;
+static const int nr_monitor = 4;
+
+static LIST_HEAD(phase_mpam_waitqueue);
+
+//mpam_profile_res[0] not used
+struct memqos_mpam_profile mpam_profile_res[16] = {
+ { .partid = 0, .monitor = 0, .used = 1},
+ { .partid = 1, .monitor = 0,},
+ { .partid = 2, .monitor = 1,},
+ { .partid = 3, .monitor = 2,},
+ { .partid = 4, .monitor = 3,},
+ { .partid = 5, .monitor = 0,},
+ { .partid = 6, .monitor = 1,},
+ { .partid = 7, .monitor = 2,},
+ { .partid = 8, .monitor = 3,},
+ { .partid = 9, .monitor = 0,},
+ { .partid = 10, .monitor = 1,},
+ { .partid = 11, .monitor = 2,},
+ { .partid = 12, .monitor = 3,},
+ { .partid = 13, .monitor = 0,},
+ { .partid = 14, .monitor = 1,},
+ { .partid = 15, .monitor = 2,},
+};
+
+static DEFINE_SPINLOCK(phase_partid_lock);
+
+void phase_update_mpam_label(struct task_struct *tsk)
+{
+ int i = 0;
+ //unsigned long flag;
+
+ WARN_ON_ONCE(tsk->closid);
+
+ if (tsk->sched_memqos.corrupt) {
+ phase_release_mpam_label(tsk);
+ return;
+ }
+
+ spin_lock(&phase_partid_lock);
+ if (tsk->sched_memqos.mpam_profile.profile != &mpam_profile_res[0] &&
+ tsk->sched_memqos.mpam_profile.profile != NULL) {
+ tsk->closid = tsk->sched_memqos.mpam_profile.profile->partid;
+ tsk->sched_memqos.mpam_profile.profile->tsk = tsk;
+ //tsk->sched_memqos.mpam_profile.profile->used = 1;
+ tsk->rmid = 0;
+ spin_unlock(&phase_partid_lock);
+ //if (static_branch_unlikely(&sched_phase_printk)) {
+ // trace_printk("task pid:%d get partid%d succeed\n", tsk->pid, tsk->closid);
+ //}
+ __mpam_sched_in_v2(tsk);
+ return;
+ }
+
+ //is in profile queue, wait...
+ if (tsk->sched_memqos.mpam_profile.profile == &mpam_profile_res[0]) {
+ spin_unlock(&phase_partid_lock);
+ return;
+ }
+
+ for (i = 1; i < 16; i++) {
+ if (mpam_profile_res[i].used) {
+ if (static_branch_unlikely(&sched_phase_printk)) {
+ //if (mpam_profile_res[i].tsk)
+ // trace_printk("i%d want get partid, butpartid:%d get by pid:%d closid:%d\n",
+ //tsk->pid, i, mpam_profile_res[i].tsk->pid, mpam_profile_res[i].tsk->closid);
+ //else
+ // trace_printk("i%d want get partid, butpartid:%d get by pid:%d(NULL)\n",
+ //tsk->pid, i, tsk->pid);
+ }
+
+ continue;
+ }
+
+ tsk->sched_memqos.mpam_profile.profile = NULL;
+ break;
+ }
+
+ if (i == 16) {
+ list_move_tail(&tsk->sched_memqos.mpam_profile.wait_list, &phase_mpam_waitqueue);
+ tsk->sched_memqos.mpam_profile.profile = &mpam_profile_res[0];
+ spin_unlock(&phase_partid_lock);
+ //if (static_branch_unlikely(&sched_phase_printk)) {
+ // trace_printk("task pid:%d no partid found, go to list\n", tsk->pid);
+ //}
+ //wait...
+ return;
+ }
+
+ mpam_profile_res[i].used = 1;
+ tsk->closid = mpam_profile_res[i].partid;
+ mpam_profile_res[i].tsk = tsk;
+ tsk->sched_memqos.mpam_profile.profile = &mpam_profile_res[i];
+ tsk->rmid = 0;
+ spin_unlock(&phase_partid_lock);
+ //if (static_branch_unlikely(&sched_phase_printk)) {
+ //trace_printk("task pid:%d get partid%d succeed\n", tsk->pid, i);
+ //}
+
+ __mpam_sched_in_v2(tsk);
+}
+
+static void phase_release_mpam_label_without_lock(struct task_struct *tsk)
+{
+ int closid;
+ struct memqos_wait_profile *next;
+
+ //assert locked
+
+ if (tsk->sched_memqos.mpam_profile.profile &&
+ tsk->sched_memqos.mpam_profile.profile->partid) {
+ closid = tsk->sched_memqos.mpam_profile.profile->partid;
+ } else if (tsk->closid == 0) {
+ return;
+ } else {
+ closid = tsk->closid;
+ }
+
+ tsk->closid = 0;
+ tsk->sched_memqos.mpam_profile.profile = NULL;
+ mpam_profile_res[closid].used = 0;
+ mpam_profile_res[closid].tsk = NULL;
+
+ //if (static_branch_unlikely(&sched_phase_printk)) {
+ // trace_printk("task pid:%d release partid%d, list empty:%d\n", tsk->pid, closid, list_empty(&phase_mpam_waitqueue));
+ //}
+
+ next = list_first_entry_or_null(&phase_mpam_waitqueue, struct memqos_wait_profile, wait_list);
+ if (next) {
+ list_del_init(&next->wait_list);
+ mpam_profile_res[closid].used = 1;
+ next->profile = &mpam_profile_res[closid];
+ }
+
+ return;
+}
+
+//task shutdown
+void phase_destroy_waitqueue(struct task_struct *tsk)
+{
+ spin_lock(&phase_partid_lock);
+
+ //if (tsk->sched_memqos.mpam_profile.profile == &mpam_profile_res[0]) {
+ list_del_init(&tsk->sched_memqos.mpam_profile.wait_list);
+ //} else {
+ phase_release_mpam_label_without_lock(tsk);
+ //}
+ spin_unlock(&phase_partid_lock);
+}
+
+void phase_release_mpam_label(struct task_struct *tsk)
+{
+ spin_lock(&phase_partid_lock);
+ phase_release_mpam_label_without_lock(tsk);
+ spin_unlock(&phase_partid_lock);
+}
+
+#include <asm/mpam.h>
+void phase_get_memband(struct memqos_mpam_profile *pm, int *result, int nr)
+{
+ if (pm == &mpam_profile_res[0] || pm == NULL) {
+ result[0] = 0;
+ result[1] = 0;
+ result[2] = 0;
+ result[3] = 0;
+ return;
+ }
+
+ mpam_component_config_mbwu_mon(pm->partid, pm->pmg, pm->monitor, result, nr);
+}
diff --git a/kernel/sched/memqos/phase_perf.c b/kernel/sched/memqos/phase_perf.c
new file mode 100644
index 0000000000000..7b7f37e46f76c
--- /dev/null
+++ b/kernel/sched/memqos/phase_perf.c
@@ -0,0 +1,412 @@
+#include <linux/kernel.h>
+#include <linux/perf_event.h>
+#include <linux/percpu-defs.h>
+#include <linux/slab.h>
+#include <linux/stop_machine.h>
+#include <linux/memqos.h>
+#include <linux/sched.h>
+
+#define PHASE_FEVENT_NUM 3
+
+int *phase_perf_pevents = NULL;
+
+static DEFINE_PER_CPU(__typeof__(struct perf_event *)[PHASE_PEVENT_NUM], cpu_phase_perf_events);
+
+/******************************************
+ * Helpers for phase perf event
+ *****************************************/
+static inline struct perf_event *perf_event_of_cpu(int cpu, int index)
+{
+ return per_cpu(cpu_phase_perf_events, cpu)[index];
+}
+
+static inline struct perf_event **perf_events_of_cpu(int cpu)
+{
+ return per_cpu(cpu_phase_perf_events, cpu);
+}
+
+static inline u64 perf_event_local_pmu_read(struct perf_event *event)
+{
+ return 0;
+ if (event->state == PERF_EVENT_STATE_ACTIVE)
+ event->pmu->read(event);
+ return local64_read(&event->count);
+}
+
+/******************************************
+ * Helpers for cpu counters
+ *****************************************/
+static inline u64 read_cpu_counter(int cpu, int index)
+{
+ struct perf_event *event = perf_event_of_cpu(cpu, index);
+
+ if (!event || !event->pmu)
+ return 0;
+
+ return perf_event_local_pmu_read(event);
+}
+
+static struct perf_event_attr *alloc_attr(int event_id)
+{
+ struct perf_event_attr *attr;
+
+ attr = kzalloc(sizeof(struct perf_event_attr), GFP_KERNEL);
+ if (!attr)
+ return ERR_PTR(-ENOMEM);
+
+ attr->type = PERF_TYPE_RAW;
+ attr->config = event_id;
+ attr->size = sizeof(struct perf_event_attr);
+ attr->pinned = 1;
+ attr->disabled = 1;
+ //attr->exclude_hv;
+ //attr->exclude_idle;
+ //attr->exclude_kernel;
+
+ return attr;
+}
+
+static int create_cpu_counter(int cpu, int event_id, int index)
+{
+ struct perf_event_attr *attr = NULL;
+ struct perf_event **events = perf_events_of_cpu(cpu);
+ struct perf_event *event = NULL;
+
+ return 0;
+ attr = alloc_attr(event_id);
+ if (IS_ERR(attr))
+ return PTR_ERR(attr);
+
+ event = perf_event_create_kernel_counter(attr, cpu, NULL, NULL, NULL);
+ if (IS_ERR(event)) {
+ pr_err("unable to create perf event (cpu:%i-type:%d-pinned:%d-config:0x%llx) : %ld",
+ cpu, attr->type, attr->pinned, attr->config, PTR_ERR(event));
+ kfree(attr);
+ return PTR_ERR(event);
+ } else {
+ events[index] = event;
+ perf_event_enable(events[index]);
+ if (event->hw.idx == -1) {
+ pr_err("pinned event unable to get onto hardware, perf event (cpu:%i-type:%d-config:0x%llx)",
+ cpu, attr->type, attr->config);
+ kfree(attr);
+ return -EINVAL;
+ }
+ pr_info("create perf_event (cpu:%i-idx:%d-type:%d-pinned:%d-exclude_hv:%d"
+ "-exclude_idle:%d-exclude_kernel:%d-config:0x%llx-addr:%px)",
+ event->cpu, event->hw.idx,
+ event->attr.type, event->attr.pinned, event->attr.exclude_hv,
+ event->attr.exclude_idle, event->attr.exclude_kernel,
+ event->attr.config, event);
+ }
+
+ kfree(attr);
+ return 0;
+}
+
+static int release_cpu_counter(int cpu, int event_id, int index)
+{
+ struct perf_event **events = perf_events_of_cpu(cpu);
+ struct perf_event *event = NULL;
+
+ return 0;
+ event = events[index];
+
+ if (!event)
+ return 0;
+
+ pr_info("release perf_event (cpu:%i-idx:%d-type:%d-pinned:%d-exclude_hv:%d"
+ "-exclude_idle:%d-exclude_kernel:%d-config:0x%llx)",
+ event->cpu, event->hw.idx,
+ event->attr.type, event->attr.pinned, event->attr.exclude_hv,
+ event->attr.exclude_idle, event->attr.exclude_kernel,
+ event->attr.config);
+
+ perf_event_release_kernel(event);
+ events[index] = NULL;
+
+ return 0;
+}
+
+enum {
+ CYCLES_INDEX = 0,
+ INST_RETIRED_INDEX,
+ PHASE_EVENT_FINAL_TERMINATOR
+};
+
+#define CYCLES 0x0011
+#define INST_RETIRED 0x0008
+
+static int pevents[PHASE_PEVENT_NUM] = {
+ CYCLES,
+ INST_RETIRED,
+ PHASE_EVENT_FINAL_TERMINATOR,
+};
+
+#define for_each_phase_pevents(index, events) \
+ for (index = 0; events != NULL && index < PHASE_PEVENT_NUM && \
+ events[index] != PHASE_EVENT_FINAL_TERMINATOR; index++)
+
+
+/******************************************
+ * Helpers for phase perf
+ *****************************************/
+static int do_pevents(int (*fn)(int, int, int), int cpu)
+{
+ int index;
+ int err;
+
+ for_each_phase_pevents(index, phase_perf_pevents) {
+ err = fn(cpu, phase_perf_pevents[index], index);
+ if (err)
+ return err;
+ }
+
+ return 0;
+}
+
+static int __phase_perf_create(void *args)
+{
+ int err;
+ int cpu = raw_smp_processor_id();
+
+ /* create pinned events */
+ pr_info("create pinned events\n");
+ err = do_pevents(create_cpu_counter, cpu);
+ if (err) {
+ pr_err("create pinned events failed\n");
+ do_pevents(release_cpu_counter, cpu);
+ return err;
+ }
+
+ pr_info("[%d] phase class event create success\n", cpu);
+ return 0;
+}
+
+static int do_phase_perf_create(int *pevents, const struct cpumask *cpus)
+{
+ phase_perf_pevents = pevents;
+ return stop_machine(__phase_perf_create, NULL, cpus);
+}
+
+static int __do_phase_perf_release(void *args)
+{
+ int cpu = raw_smp_processor_id();
+
+ /* release pinned events */
+ pr_info("release pinned events\n");
+ do_pevents(release_cpu_counter, cpu);
+
+ pr_info("[%d] phase class event release success\n", cpu);
+ return 0;
+}
+
+static void do_phase_perf_release(const struct cpumask *cpus)
+{
+ stop_machine(__do_phase_perf_release, NULL, cpus);
+ phase_perf_pevents = NULL;
+}
+
+int phase_perf_create(void)
+{
+ return do_phase_perf_create(pevents, cpu_possible_mask);
+}
+
+void phase_perf_release(void)
+{
+ do_phase_perf_release(cpu_possible_mask);
+}
+
+DECLARE_STATIC_KEY_FALSE(sched_phase);
+DECLARE_STATIC_KEY_FALSE(sched_phase_printk);
+
+#define PHASE_EVENT_OVERFLOW (~0ULL)
+
+static inline u64 phase_event_count_sub(u64 curr, u64 prev)
+{
+ if (curr < prev) { /* ovewrflow */
+ u64 tmp = PHASE_EVENT_OVERFLOW - prev;
+ return curr + tmp;
+ } else {
+ return curr - prev;
+ }
+}
+
+static inline void phase_calc_delta(struct task_struct *p,
+ struct phase_event_count *prev,
+ struct phase_event_count *curr,
+ struct phase_event_count *delta)
+{
+ int *pevents = phase_perf_pevents;
+ int index;
+
+ for_each_phase_pevents(index, pevents) {
+ delta->pcount.data[index] = phase_event_count_sub(curr->pcount.data[index], prev->pcount.data[index]);
+ }
+}
+
+static inline u64 phase_data_of_pevent(struct phase_event_pcount *counter, int event_id)
+{
+ int index;
+ int *events = phase_perf_pevents;
+
+ for_each_phase_pevents(index, events) {
+ if (event_id == events[index])
+ return counter->data[index];
+ }
+
+ return 0;
+}
+
+static int cal_ring_history_average(int *history, int nr, int s_pos, int c_nr)
+{
+ int average = 0;
+ int start = ((s_pos - c_nr) + nr) % nr;
+
+ if (start < 0)
+ return 0;
+
+ for (;start != s_pos;) {
+ if (history[start] == 0) {
+ c_nr--;
+ if (c_nr == 0)
+ return 0;
+ continue;
+ }
+ average += history[start];
+ start = (start + 1) % nr;
+ }
+
+ return start / c_nr;
+}
+
+static void __phase_cal_ipcx10(struct task_struct *p, struct phase_event_count *delta)
+{
+ u64 ins;
+ u64 cycles;
+ //invalid zero
+ int ipcx10 = 0;
+
+ ins = phase_data_of_pevent(&delta->pcount, INST_RETIRED_INDEX);
+ cycles = phase_data_of_pevent(&delta->pcount, CYCLES_INDEX);
+
+ if (cycles)
+ ipcx10 = (ins * 10) / cycles;
+
+ //if (static_branch_unlikely(&sched_phase_printk)) {
+ // trace_printk("ins:%lld cycles:%lld\n", ins, cycles);
+ //}
+
+ p->sched_memqos.ipcx10_history[p->sched_memqos.ipc_ringpos] = ipcx10;
+ p->sched_memqos.ipc_ringpos = (p->sched_memqos.ipc_ringpos + 1) % 10;
+ cal_ring_history_average(p->sched_memqos.ipcx10_history, 10, p->sched_memqos.ipc_ringpos, 5);
+}
+
+static void __phase_cal_memband_div_10(struct task_struct *p)
+{
+ int pos;
+ int result[4];
+
+ pos = p->sched_memqos.memband_ringpos;
+
+ phase_get_memband(p->sched_memqos.mpam_profile.profile, result, 4);
+
+ //if (static_branch_unlikely(&sched_phase_printk)) {
+ // trace_printk("memband:%d %d %d %d profile:%llx\n", result[0], result[1], result[2], result[3], p->sched_memqos.mpam_profile.profile);
+ //}
+
+ p->sched_memqos.memband_div_10_total[0] = p->sched_memqos.memband_div_10_total[0] - p->sched_memqos.memband_div_10_history[0][pos];
+ p->sched_memqos.memband_div_10_total[0] = p->sched_memqos.memband_div_10_total[0] + result[0] / 10;
+ p->sched_memqos.memband_div_10_history[0][p->sched_memqos.memband_ringpos] = result[0] / 10;
+
+ p->sched_memqos.memband_div_10_total[1] = p->sched_memqos.memband_div_10_total[1] - p->sched_memqos.memband_div_10_history[1][pos];
+ p->sched_memqos.memband_div_10_total[1] = p->sched_memqos.memband_div_10_total[1] + result[1] / 10;
+ p->sched_memqos.memband_div_10_history[1][p->sched_memqos.memband_ringpos] = result[1] / 10;
+
+ p->sched_memqos.memband_div_10_total[2] = p->sched_memqos.memband_div_10_total[2] - p->sched_memqos.memband_div_10_history[2][pos];
+ p->sched_memqos.memband_div_10_total[2] = p->sched_memqos.memband_div_10_total[2] + result[2] / 10;
+ p->sched_memqos.memband_div_10_history[2][p->sched_memqos.memband_ringpos] = result[2] / 10;
+
+ p->sched_memqos.memband_div_10_total[3] = p->sched_memqos.memband_div_10_total[3] - p->sched_memqos.memband_div_10_history[3][pos];
+ p->sched_memqos.memband_div_10_total[3] = p->sched_memqos.memband_div_10_total[3] + result[3] / 10;
+ p->sched_memqos.memband_div_10_history[3][p->sched_memqos.memband_ringpos] = result[3] / 10;
+
+ p->sched_memqos.memband_ringpos = (pos + 1) % 10;
+
+ cal_ring_history_average(p->sched_memqos.memband_div_10_history[0], 10, pos, 5);
+ cal_ring_history_average(p->sched_memqos.memband_div_10_history[1], 10, pos, 5);
+ cal_ring_history_average(p->sched_memqos.memband_div_10_history[2], 10, pos, 5);
+ cal_ring_history_average(p->sched_memqos.memband_div_10_history[3], 10, pos, 5);
+}
+
+static DEFINE_PER_CPU(struct phase_event_count, prev_phase_event_count);
+static DEFINE_PER_CPU(struct phase_event_count, curr_phase_event_count);
+
+static void phase_perf_read_events(int cpu, u64 *pdata)
+{
+ int index;
+
+ for_each_phase_pevents(index, phase_perf_pevents) {
+ pdata[index] = read_cpu_counter(cpu, index);
+ }
+}
+
+static inline struct phase_event_count *phase_read_prev(unsigned int cpu)
+{
+ return &per_cpu(prev_phase_event_count, cpu);
+}
+
+static inline struct phase_event_count *phase_read_curr(unsigned int cpu)
+{
+ struct phase_event_count *curr = &per_cpu(curr_phase_event_count, cpu);
+
+ phase_perf_read_events(cpu, curr->pcount.data);
+
+ return curr;
+}
+
+void phase_account_task(struct task_struct *p, int cpu)
+{
+ struct phase_event_count delta;
+ struct phase_event_count *prev, *curr;
+
+ if (!static_branch_likely(&sched_phase))
+ return;
+
+ //if (!sched_core_enabled(cpu_rq(cpu)))
+ // return;
+
+ /* update phase_event_count */
+ prev = phase_read_prev(cpu);
+ curr = phase_read_curr(cpu);
+ phase_calc_delta(p, prev, curr, &delta);
+ *prev = *curr;
+
+ /* calculate phase */
+ __phase_cal_ipcx10(p, &delta);
+ __phase_cal_memband_div_10(p);
+ p->sched_memqos.sample_times++;
+ if ((p->sched_memqos.sample_times % 3) == 0)
+ p->sched_memqos.account_ready = 1;
+}
+
+
+void phase_trace_printk(struct task_struct *p)
+{
+ if (!static_branch_unlikely(&sched_phase_printk))
+ return;
+
+ trace_printk("p->comm:%s(%d) ipcpos:%d ipcx10:%d membandpos:%d memband_div_10:%d numa_score[0]:%d numa_score[1]:%d numa_score[2]:%d numa_score[3]:%d turbo:%d prefered_nid:%d classid:%d partid:%d\n",
+ p->comm, p->pid, p->sched_memqos.ipc_ringpos,\
+ p->sched_memqos.ipcx10, \
+ p->sched_memqos.memband_ringpos,\
+ p->sched_memqos.memband_div_10, \
+ p->sched_memqos.numa_score[0], \
+ p->sched_memqos.numa_score[1], \
+ p->sched_memqos.numa_score[2], \
+ p->sched_memqos.numa_score[3], \
+ p->sched_memqos.turbo, \
+ p->sched_memqos.preferred_nid, \
+ p->sched_memqos.class_id, \
+ p->closid);
+}
diff --git a/kernel/sched/memqos/phase_sim_knn.c b/kernel/sched/memqos/phase_sim_knn.c
new file mode 100644
index 0000000000000..b80bb6b9ae0a3
--- /dev/null
+++ b/kernel/sched/memqos/phase_sim_knn.c
@@ -0,0 +1,92 @@
+#include <linux/types.h>
+
+#define DATA_ROW 20
+void QuickSort(u64 arr[DATA_ROW][2], int L, int R) {
+ int i = L;
+ int j = R;
+ int kk = (L + R) / 2;
+ u64 pivot = arr[kk][0];
+
+ while (i <= j) {
+ while (pivot > arr[i][0]) {
+ i++;
+ }
+ while (pivot < arr[j][0]) {
+ j--;
+ }
+ if (i <= j) {
+ u64 temp = arr[i][0];
+
+ arr[i][0] = arr[j][0];
+ arr[j][0] = temp;
+ i++; j--;
+ }
+ }
+ if (L < j) {
+ QuickSort(arr, L, j);
+ }
+ if (i < R) {
+ QuickSort(arr, i, R);
+ }
+}
+
+u64 euclidean_distance(u64 *row1, u64 *row2, int col) {
+ u64 distance = 0;
+ int i;
+
+ for (i = 0; i < col - 1; i++) {
+ distance += ((row1[i] - row2[i]) * (row1[i] - row2[i]));
+ }
+ return distance;
+}
+
+#define num_neighbors 6
+#define MAX_TAG 8
+
+int get_neighbors_tag(u64 train_data[DATA_ROW][3], int train_row, int col, u64 *test_row) {
+ int i;
+ u64 neighbors[MAX_TAG] = {0};
+ int max_tag = 0;
+ u64 distances[DATA_ROW][2];
+
+ for (i = 0; i < train_row; i++) {
+ distances[i][0] = euclidean_distance(train_data[i], test_row, col);
+ distances[i][1] = train_data[i][col - 1];
+ }
+ QuickSort(distances, 0, train_row - 1);
+ for (i = 0; i < num_neighbors; i++) {
+ neighbors[distances[i][1]]++;
+ if (neighbors[distances[i][1]] > neighbors[max_tag])
+ max_tag = distances[i][1];
+ }
+ return max_tag;
+}
+
+static u64 train_data[DATA_ROW][3] = {
+ {0, 1, 0},
+ {0, 9, 0},
+ {0, 20, 1},
+ {0, 30, 1},
+ {0, 40, 2},
+ {0, 50, 3},
+ {0, 60, 3},
+ {0, 70, 3},
+ {0, 80, 4},
+ {0, 90, 4},
+ {0, 100, 4},
+ {0, 110, 5},
+ {0, 120, 5},
+ {0, 130, 6},
+ {0, 140, 6},
+ {0, 150, 7},
+};
+
+int knn_get_tag(int ipcx10, int memband_div_10)
+{
+ u64 test_data[2];
+
+ test_data[0] = ipcx10;
+ test_data[1] = memband_div_10;
+
+ return get_neighbors_tag(train_data, DATA_ROW, 3, test_data);
+}
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 685f9881b8e23..0d2764c4449ce 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -465,6 +465,13 @@ static struct ctl_table kern_table[] = {
.extra2 = &one,
},
#endif /* CONFIG_NUMA_BALANCING */
+#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
+ {
+ .procname = "phase",
+ .mode = 0555,
+ .child = phase_table,
+ },
+#endif
#endif /* CONFIG_SCHED_DEBUG */
{
.procname = "sched_rt_period_us",
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 4cac46d56f387..d748c291e7047 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -2164,12 +2164,15 @@ alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
{
struct mempolicy *pol;
struct page *page;
- int preferred_nid;
+ int preferred_nid = -1;
nodemask_t *nmask;
+ if (current->sched_memqos.preferred_nid)
+ preferred_nid = current->sched_memqos.preferred_nid - 1;
+
pol = get_vma_policy(vma, addr);
- if (pol->mode == MPOL_INTERLEAVE) {
+ if (pol->mode == MPOL_INTERLEAVE && preferred_nid != -1) {
unsigned nid;
nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order);
@@ -2233,7 +2236,8 @@ alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
}
nmask = policy_nodemask(gfp, pol);
- preferred_nid = policy_node(gfp, pol, node);
+ if (preferred_nid == -1)
+ preferred_nid = policy_node(gfp, pol, node);
page = __alloc_pages_nodemask(gfp, order, preferred_nid, nmask);
mark_vma_cdm(nmask, page, vma);
mpol_cond_put(pol);
--
2.25.1
1
0

23 Mar '23
From: Yixing Liu <liuyixing1(a)huawei.com>
mainline inclusion
form mainline-master
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I6K9B6
CVE: NA
Reference: https://patchwork.kernel.org/project/linux-rdma/cover/20230304091555.224129…
---------------------------------------------------------------
The current resource query for vf caps is driven
by the driver, which is unreasonable. This patch
adds a new command HNS_ROCE_OPC_QUERY_PF_CAPS_NUM
to support obtaining vf caps information from firmware.
Signed-off-by: Yixing Liu <liuyixing1(a)huawei.com>
Signed-off-by: Haoyue Xu <xuhaoyue1(a)hisilicon.com>
Reviewed-by: Yangyang Li <liyangyang20(a)huawei.com>
---
drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 203 +++++++--------------
drivers/infiniband/hw/hns/hns_roce_hw_v2.h | 33 +---
2 files changed, 64 insertions(+), 172 deletions(-)
diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
index c9826a010f38..cba78f0eac14 100644
--- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
+++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
@@ -2157,102 +2157,6 @@ static int hns_roce_v2_set_bt(struct hns_roce_dev *hr_dev)
return hns_roce_cmq_send(hr_dev, &desc, 1);
}
-/* Use default caps when hns_roce_query_pf_caps() failed or init VF profile */
-static void set_default_caps(struct hns_roce_dev *hr_dev)
-{
- struct hns_roce_caps *caps = &hr_dev->caps;
-
- caps->num_qps = HNS_ROCE_V2_MAX_QP_NUM;
- caps->max_wqes = HNS_ROCE_V2_MAX_WQE_NUM;
- caps->num_cqs = HNS_ROCE_V2_MAX_CQ_NUM;
- caps->num_srqs = HNS_ROCE_V2_MAX_SRQ_NUM;
- caps->min_cqes = HNS_ROCE_MIN_CQE_NUM;
- caps->max_cqes = HNS_ROCE_V2_MAX_CQE_NUM;
- caps->max_sq_sg = HNS_ROCE_V2_MAX_SQ_SGE_NUM;
- caps->max_extend_sg = HNS_ROCE_V2_MAX_EXTEND_SGE_NUM;
- caps->max_rq_sg = HNS_ROCE_V2_MAX_RQ_SGE_NUM;
-
- caps->num_uars = HNS_ROCE_V2_UAR_NUM;
- caps->phy_num_uars = HNS_ROCE_V2_PHY_UAR_NUM;
- caps->num_aeq_vectors = HNS_ROCE_V2_AEQE_VEC_NUM;
- caps->num_other_vectors = HNS_ROCE_V2_ABNORMAL_VEC_NUM;
- caps->num_comp_vectors = 0;
-
- caps->num_mtpts = HNS_ROCE_V2_MAX_MTPT_NUM;
- caps->num_pds = HNS_ROCE_V2_MAX_PD_NUM;
- caps->num_qpc_timer = HNS_ROCE_V2_MAX_QPC_TIMER_NUM;
- caps->cqc_timer_bt_num = HNS_ROCE_V2_MAX_CQC_TIMER_BT_NUM;
-
- caps->max_qp_init_rdma = HNS_ROCE_V2_MAX_QP_INIT_RDMA;
- caps->max_qp_dest_rdma = HNS_ROCE_V2_MAX_QP_DEST_RDMA;
- caps->max_sq_desc_sz = HNS_ROCE_V2_MAX_SQ_DESC_SZ;
- caps->max_rq_desc_sz = HNS_ROCE_V2_MAX_RQ_DESC_SZ;
- caps->max_srq_desc_sz = HNS_ROCE_V2_MAX_SRQ_DESC_SZ;
- caps->irrl_entry_sz = HNS_ROCE_V2_IRRL_ENTRY_SZ;
- caps->trrl_entry_sz = HNS_ROCE_V2_EXT_ATOMIC_TRRL_ENTRY_SZ;
- caps->cqc_entry_sz = HNS_ROCE_V2_CQC_ENTRY_SZ;
- caps->srqc_entry_sz = HNS_ROCE_V2_SRQC_ENTRY_SZ;
- caps->mtpt_entry_sz = HNS_ROCE_V2_MTPT_ENTRY_SZ;
- caps->idx_entry_sz = HNS_ROCE_V2_IDX_ENTRY_SZ;
- caps->page_size_cap = HNS_ROCE_V2_PAGE_SIZE_SUPPORTED;
- caps->reserved_lkey = 0;
- caps->reserved_pds = 0;
- caps->reserved_mrws = 1;
- caps->reserved_uars = 0;
- caps->reserved_cqs = 0;
- caps->reserved_srqs = 0;
- caps->reserved_qps = HNS_ROCE_V2_RSV_QPS;
-
- caps->qpc_hop_num = HNS_ROCE_CONTEXT_HOP_NUM;
- caps->srqc_hop_num = HNS_ROCE_CONTEXT_HOP_NUM;
- caps->cqc_hop_num = HNS_ROCE_CONTEXT_HOP_NUM;
- caps->mpt_hop_num = HNS_ROCE_CONTEXT_HOP_NUM;
- caps->sccc_hop_num = HNS_ROCE_SCCC_HOP_NUM;
-
- caps->mtt_hop_num = HNS_ROCE_MTT_HOP_NUM;
- caps->wqe_sq_hop_num = HNS_ROCE_SQWQE_HOP_NUM;
- caps->wqe_sge_hop_num = HNS_ROCE_EXT_SGE_HOP_NUM;
- caps->wqe_rq_hop_num = HNS_ROCE_RQWQE_HOP_NUM;
- caps->cqe_hop_num = HNS_ROCE_CQE_HOP_NUM;
- caps->srqwqe_hop_num = HNS_ROCE_SRQWQE_HOP_NUM;
- caps->idx_hop_num = HNS_ROCE_IDX_HOP_NUM;
- caps->chunk_sz = HNS_ROCE_V2_TABLE_CHUNK_SIZE;
-
- caps->flags = HNS_ROCE_CAP_FLAG_REREG_MR |
- HNS_ROCE_CAP_FLAG_ROCE_V1_V2 |
- HNS_ROCE_CAP_FLAG_CQ_RECORD_DB |
- HNS_ROCE_CAP_FLAG_QP_RECORD_DB;
-
- caps->pkey_table_len[0] = 1;
- caps->ceqe_depth = HNS_ROCE_V2_COMP_EQE_NUM;
- caps->aeqe_depth = HNS_ROCE_V2_ASYNC_EQE_NUM;
- caps->local_ca_ack_delay = 0;
- caps->max_mtu = IB_MTU_4096;
-
- caps->max_srq_wrs = HNS_ROCE_V2_MAX_SRQ_WR;
- caps->max_srq_sges = HNS_ROCE_V2_MAX_SRQ_SGE;
-
- caps->flags |= HNS_ROCE_CAP_FLAG_ATOMIC | HNS_ROCE_CAP_FLAG_MW |
- HNS_ROCE_CAP_FLAG_SRQ | HNS_ROCE_CAP_FLAG_FRMR |
- HNS_ROCE_CAP_FLAG_QP_FLOW_CTRL | HNS_ROCE_CAP_FLAG_XRC;
-
- caps->gid_table_len[0] = HNS_ROCE_V2_GID_INDEX_NUM;
-
- if (hr_dev->pci_dev->revision >= PCI_REVISION_ID_HIP09) {
- caps->flags |= HNS_ROCE_CAP_FLAG_STASH |
- HNS_ROCE_CAP_FLAG_DIRECT_WQE |
- HNS_ROCE_CAP_FLAG_DCA_MODE;
- caps->max_sq_inline = HNS_ROCE_V3_MAX_SQ_INLINE;
- } else {
- caps->max_sq_inline = HNS_ROCE_V2_MAX_SQ_INLINE;
-
- /* The following configuration are only valid for HIP08 */
- caps->qpc_sz = HNS_ROCE_V2_QPC_SZ;
- caps->sccc_sz = HNS_ROCE_V2_SCCC_SZ;
- caps->cqe_sz = HNS_ROCE_V2_CQE_SIZE;
- }
-}
-
static void calc_pg_sz(u32 obj_num, u32 obj_size, u32 hop_num, u32 ctx_bt_num,
u32 *buf_page_size, u32 *bt_page_size, u32 hem_type)
{
@@ -2395,7 +2299,8 @@ static void apply_func_caps(struct hns_roce_dev *hr_dev)
if (!caps->num_comp_vectors)
caps->num_comp_vectors = min_t(u32, caps->eqc_bt_num - 1,
- (u32)priv->handle->rinfo.num_vectors - 2);
+ (u32)priv->handle->rinfo.num_vectors -
+ (HNS_ROCE_V2_AEQE_VEC_NUM + HNS_ROCE_V2_ABNORMAL_VEC_NUM));
if (hr_dev->pci_dev->revision >= PCI_REVISION_ID_HIP09) {
caps->eqe_hop_num = HNS_ROCE_V3_EQE_HOP_NUM;
@@ -2437,7 +2342,7 @@ static void apply_func_caps(struct hns_roce_dev *hr_dev)
set_hem_page_size(hr_dev);
}
-static int hns_roce_query_pf_caps(struct hns_roce_dev *hr_dev)
+static int hns_roce_query_caps(struct hns_roce_dev *hr_dev)
{
struct hns_roce_cmq_desc desc[HNS_ROCE_QUERY_PF_CAPS_CMD_NUM];
struct hns_roce_caps *caps = &hr_dev->caps;
@@ -2446,15 +2351,17 @@ static int hns_roce_query_pf_caps(struct hns_roce_dev *hr_dev)
struct hns_roce_query_pf_caps_c *resp_c;
struct hns_roce_query_pf_caps_d *resp_d;
struct hns_roce_query_pf_caps_e *resp_e;
+ enum hns_roce_opcode_type cmd;
int ctx_hop_num;
int pbl_hop_num;
int ret;
int i;
+ cmd = hr_dev->is_vf ? HNS_ROCE_OPC_QUERY_VF_CAPS_NUM :
+ HNS_ROCE_OPC_QUERY_PF_CAPS_NUM;
+
for (i = 0; i < HNS_ROCE_QUERY_PF_CAPS_CMD_NUM; i++) {
- hns_roce_cmq_setup_basic_desc(&desc[i],
- HNS_ROCE_OPC_QUERY_PF_CAPS_NUM,
- true);
+ hns_roce_cmq_setup_basic_desc(&desc[i], cmd, true);
if (i < (HNS_ROCE_QUERY_PF_CAPS_CMD_NUM - 1))
desc[i].flag |= cpu_to_le16(HNS_ROCE_CMD_FLAG_NEXT);
else
@@ -2471,38 +2378,38 @@ static int hns_roce_query_pf_caps(struct hns_roce_dev *hr_dev)
resp_d = (struct hns_roce_query_pf_caps_d *)desc[3].data;
resp_e = (struct hns_roce_query_pf_caps_e *)desc[4].data;
- caps->local_ca_ack_delay = resp_a->local_ca_ack_delay;
- caps->max_sq_sg = le16_to_cpu(resp_a->max_sq_sg);
- caps->max_sq_inline = le16_to_cpu(resp_a->max_sq_inline);
- caps->max_rq_sg = le16_to_cpu(resp_a->max_rq_sg);
+ caps->local_ca_ack_delay = resp_a->local_ca_ack_delay;
+ caps->max_sq_sg = le16_to_cpu(resp_a->max_sq_sg);
+ caps->max_sq_inline = le16_to_cpu(resp_a->max_sq_inline);
+ caps->max_rq_sg = le16_to_cpu(resp_a->max_rq_sg);
caps->max_rq_sg = roundup_pow_of_two(caps->max_rq_sg);
- caps->max_extend_sg = le32_to_cpu(resp_a->max_extend_sg);
- caps->num_qpc_timer = le16_to_cpu(resp_a->num_qpc_timer);
- caps->max_srq_sges = le16_to_cpu(resp_a->max_srq_sges);
+ caps->max_extend_sg = le32_to_cpu(resp_a->max_extend_sg);
+ caps->num_qpc_timer = le16_to_cpu(resp_a->num_qpc_timer);
+ caps->max_srq_sges = le16_to_cpu(resp_a->max_srq_sges);
caps->max_srq_sges = roundup_pow_of_two(caps->max_srq_sges);
- caps->num_aeq_vectors = resp_a->num_aeq_vectors;
- caps->num_other_vectors = resp_a->num_other_vectors;
- caps->max_sq_desc_sz = resp_a->max_sq_desc_sz;
- caps->max_rq_desc_sz = resp_a->max_rq_desc_sz;
- caps->max_srq_desc_sz = resp_a->max_srq_desc_sz;
- caps->cqe_sz = resp_a->cqe_sz;
-
- caps->mtpt_entry_sz = resp_b->mtpt_entry_sz;
- caps->irrl_entry_sz = resp_b->irrl_entry_sz;
- caps->trrl_entry_sz = resp_b->trrl_entry_sz;
- caps->cqc_entry_sz = resp_b->cqc_entry_sz;
- caps->srqc_entry_sz = resp_b->srqc_entry_sz;
- caps->idx_entry_sz = resp_b->idx_entry_sz;
- caps->sccc_sz = resp_b->sccc_sz;
- caps->max_mtu = resp_b->max_mtu;
- caps->qpc_sz = le16_to_cpu(resp_b->qpc_sz);
- caps->min_cqes = resp_b->min_cqes;
- caps->min_wqes = resp_b->min_wqes;
- caps->page_size_cap = le32_to_cpu(resp_b->page_size_cap);
- caps->pkey_table_len[0] = resp_b->pkey_table_len;
- caps->phy_num_uars = resp_b->phy_num_uars;
- ctx_hop_num = resp_b->ctx_hop_num;
- pbl_hop_num = resp_b->pbl_hop_num;
+ caps->num_aeq_vectors = resp_a->num_aeq_vectors;
+ caps->num_other_vectors = resp_a->num_other_vectors;
+ caps->max_sq_desc_sz = resp_a->max_sq_desc_sz;
+ caps->max_rq_desc_sz = resp_a->max_rq_desc_sz;
+ caps->max_srq_desc_sz = resp_a->max_srq_desc_sz;
+ caps->cqe_sz = resp_a->cqe_sz;
+
+ caps->mtpt_entry_sz = resp_b->mtpt_entry_sz;
+ caps->irrl_entry_sz = resp_b->irrl_entry_sz;
+ caps->trrl_entry_sz = resp_b->trrl_entry_sz;
+ caps->cqc_entry_sz = resp_b->cqc_entry_sz;
+ caps->srqc_entry_sz = resp_b->srqc_entry_sz;
+ caps->idx_entry_sz = resp_b->idx_entry_sz;
+ caps->sccc_sz = resp_b->sccc_sz;
+ caps->max_mtu = resp_b->max_mtu;
+ caps->qpc_sz = le16_to_cpu(resp_b->qpc_sz);
+ caps->min_cqes = resp_b->min_cqes;
+ caps->min_wqes = resp_b->min_wqes;
+ caps->page_size_cap = le32_to_cpu(resp_b->page_size_cap);
+ caps->pkey_table_len[0] = resp_b->pkey_table_len;
+ caps->phy_num_uars = resp_b->phy_num_uars;
+ ctx_hop_num = resp_b->ctx_hop_num;
+ pbl_hop_num = resp_b->pbl_hop_num;
caps->num_pds = 1 << hr_reg_read(resp_c, PF_CAPS_C_NUM_PDS);
@@ -2525,8 +2432,6 @@ static int hns_roce_query_pf_caps(struct hns_roce_dev *hr_dev)
caps->ceqe_depth = 1 << hr_reg_read(resp_d, PF_CAPS_D_CEQ_DEPTH);
caps->num_comp_vectors = hr_reg_read(resp_d, PF_CAPS_D_NUM_CEQS);
caps->aeqe_depth = 1 << hr_reg_read(resp_d, PF_CAPS_D_AEQ_DEPTH);
- caps->default_aeq_arm_st = hr_reg_read(resp_d, PF_CAPS_D_AEQ_ARM_ST);
- caps->default_ceq_arm_st = hr_reg_read(resp_d, PF_CAPS_D_CEQ_ARM_ST);
caps->reserved_pds = hr_reg_read(resp_d, PF_CAPS_D_RSV_PDS);
caps->num_uars = 1 << hr_reg_read(resp_d, PF_CAPS_D_NUM_UARS);
caps->reserved_qps = hr_reg_read(resp_d, PF_CAPS_D_RSV_QPS);
@@ -2537,10 +2442,6 @@ static int hns_roce_query_pf_caps(struct hns_roce_dev *hr_dev)
caps->reserved_cqs = hr_reg_read(resp_e, PF_CAPS_E_RSV_CQS);
caps->reserved_srqs = hr_reg_read(resp_e, PF_CAPS_E_RSV_SRQS);
caps->reserved_lkey = hr_reg_read(resp_e, PF_CAPS_E_RSV_LKEYS);
- caps->default_ceq_max_cnt = le16_to_cpu(resp_e->ceq_max_cnt);
- caps->default_ceq_period = le16_to_cpu(resp_e->ceq_period);
- caps->default_aeq_max_cnt = le16_to_cpu(resp_e->aeq_max_cnt);
- caps->default_aeq_period = le16_to_cpu(resp_e->aeq_period);
caps->qpc_hop_num = ctx_hop_num;
caps->sccc_hop_num = ctx_hop_num;
@@ -2557,6 +2458,20 @@ static int hns_roce_query_pf_caps(struct hns_roce_dev *hr_dev)
if (!(caps->page_size_cap & PAGE_SIZE))
caps->page_size_cap = HNS_ROCE_V2_PAGE_SIZE_SUPPORTED;
+
+ if (!hr_dev->is_vf) {
+ caps->cqe_sz = resp_a->cqe_sz;
+ caps->qpc_sz = le16_to_cpu(resp_b->qpc_sz);
+ caps->default_aeq_arm_st =
+ hr_reg_read(resp_d, PF_CAPS_D_AEQ_ARM_ST);
+ caps->default_ceq_arm_st =
+ hr_reg_read(resp_d, PF_CAPS_D_CEQ_ARM_ST);
+ caps->default_ceq_max_cnt = le16_to_cpu(resp_e->ceq_max_cnt);
+ caps->default_ceq_period = le16_to_cpu(resp_e->ceq_period);
+ caps->default_aeq_max_cnt = le16_to_cpu(resp_e->aeq_max_cnt);
+ caps->default_aeq_period = le16_to_cpu(resp_e->aeq_period);
+ }
+
return 0;
}
@@ -2626,7 +2541,11 @@ static int hns_roce_v2_vf_profile(struct hns_roce_dev *hr_dev)
hr_dev->func_num = 1;
- set_default_caps(hr_dev);
+ ret = hns_roce_query_caps(hr_dev);
+ if (ret) {
+ dev_err(dev, "failed to query VF caps, ret = %d.\n", ret);
+ return ret;
+ }
ret = hns_roce_query_vf_resource(hr_dev);
if (ret) {
@@ -2666,9 +2585,11 @@ static int hns_roce_v2_pf_profile(struct hns_roce_dev *hr_dev)
return ret;
}
- ret = hns_roce_query_pf_caps(hr_dev);
- if (ret)
- set_default_caps(hr_dev);
+ ret = hns_roce_query_caps(hr_dev);
+ if (ret) {
+ dev_err(dev, "failed to query PF caps, ret = %d.\n", ret);
+ return ret;
+ }
ret = hns_roce_query_pf_resource(hr_dev);
if (ret) {
diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.h b/drivers/infiniband/hw/hns/hns_roce_hw_v2.h
index 90401577865e..e5f3a4639bf3 100644
--- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.h
+++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.h
@@ -35,46 +35,16 @@
#include <linux/bitops.h>
-#define HNS_ROCE_V2_MAX_QP_NUM 0x1000
-#define HNS_ROCE_V2_MAX_QPC_TIMER_NUM 0x200
-#define HNS_ROCE_V2_MAX_WQE_NUM 0x8000
-#define HNS_ROCE_V2_MAX_SRQ_WR 0x8000
-#define HNS_ROCE_V2_MAX_SRQ_SGE 64
-#define HNS_ROCE_V2_MAX_CQ_NUM 0x100000
-#define HNS_ROCE_V2_MAX_CQC_TIMER_BT_NUM 0x100
-#define HNS_ROCE_V2_MAX_SRQ_NUM 0x100000
-#define HNS_ROCE_V2_MAX_CQE_NUM 0x400000
-#define HNS_ROCE_V2_MAX_RQ_SGE_NUM 64
-#define HNS_ROCE_V2_MAX_SQ_SGE_NUM 64
-#define HNS_ROCE_V2_MAX_EXTEND_SGE_NUM 0x200000
-#define HNS_ROCE_V2_MAX_SQ_INLINE 0x20
-#define HNS_ROCE_V3_MAX_SQ_INLINE 0x400
#define HNS_ROCE_V2_MAX_RC_INL_INN_SZ 32
-#define HNS_ROCE_V2_UAR_NUM 256
-#define HNS_ROCE_V2_PHY_UAR_NUM 1
+#define HNS_ROCE_V2_MTT_ENTRY_SZ 64
#define HNS_ROCE_V2_AEQE_VEC_NUM 1
#define HNS_ROCE_V2_ABNORMAL_VEC_NUM 1
-#define HNS_ROCE_V2_MAX_MTPT_NUM 0x100000
#define HNS_ROCE_V2_MAX_MTT_SEGS 0x1000000
#define HNS_ROCE_V2_MAX_SRQWQE_SEGS 0x1000000
#define HNS_ROCE_V2_MAX_IDX_SEGS 0x1000000
-#define HNS_ROCE_V2_MAX_PD_NUM 0x1000000
#define HNS_ROCE_V2_MAX_XRCD_NUM 0x1000000
#define HNS_ROCE_V2_RSV_XRCD_NUM 0
-#define HNS_ROCE_V2_MAX_QP_INIT_RDMA 128
-#define HNS_ROCE_V2_MAX_QP_DEST_RDMA 128
-#define HNS_ROCE_V2_MAX_SQ_DESC_SZ 64
-#define HNS_ROCE_V2_MAX_RQ_DESC_SZ 16
-#define HNS_ROCE_V2_MAX_SRQ_DESC_SZ 64
-#define HNS_ROCE_V2_IRRL_ENTRY_SZ 64
-#define HNS_ROCE_V2_EXT_ATOMIC_TRRL_ENTRY_SZ 100
-#define HNS_ROCE_V2_CQC_ENTRY_SZ 64
-#define HNS_ROCE_V2_SRQC_ENTRY_SZ 64
-#define HNS_ROCE_V2_MTPT_ENTRY_SZ 64
-#define HNS_ROCE_V2_MTT_ENTRY_SZ 64
-#define HNS_ROCE_V2_IDX_ENTRY_SZ 4
-#define HNS_ROCE_V2_SCCC_SZ 32
#define HNS_ROCE_V3_SCCC_SZ 64
#define HNS_ROCE_V3_GMV_ENTRY_SZ 32
@@ -242,6 +212,7 @@ enum hns_roce_opcode_type {
HNS_ROCE_OPC_QUERY_FUNC_INFO = 0x8407,
HNS_ROCE_OPC_QUERY_PF_CAPS_NUM = 0x8408,
HNS_ROCE_OPC_CFG_ENTRY_SIZE = 0x8409,
+ HNS_ROCE_OPC_QUERY_VF_CAPS_NUM = 0x8410,
HNS_ROCE_OPC_CFG_SGID_TB = 0x8500,
HNS_ROCE_OPC_CFG_SMAC_TB = 0x8501,
HNS_ROCE_OPC_POST_MB = 0x8504,
--
2.30.0
1
0

[RFC PATCH openEuler-1.0-LTS] sched: memqos: add memqos for dynamic affinity
by Wang ShaoBo 22 Mar '23
by Wang ShaoBo 22 Mar '23
22 Mar '23
Add debug memband interface to dynamic affinity, this would be
useful for those threads sensitive to memory bandwidth.
Signed-off-by: Wang ShaoBo <bobo.shaobowang(a)huawei.com>
---
arch/arm64/include/asm/mpam.h | 2 +
arch/arm64/include/asm/mpam_sched.h | 2 +
arch/arm64/kernel/mpam/mpam_device.c | 58 ++-
arch/arm64/kernel/mpam/mpam_resctrl.c | 65 ++++
include/linux/memqos.h | 142 +++++++
include/linux/sched.h | 14 +-
include/linux/sysctl.h | 2 +
kernel/cgroup/cpuset.c | 1 +
kernel/exit.c | 3 +
kernel/fork.c | 4 +
kernel/sched/Makefile | 1 +
kernel/sched/core.c | 29 +-
kernel/sched/fair.c | 14 +-
kernel/sched/memqos/Makefile | 6 +
kernel/sched/memqos/memqos.c | 297 +++++++++++++++
kernel/sched/memqos/phase_feature_sysctl.c | 126 +++++++
kernel/sched/memqos/phase_memband.c | 145 ++++++++
kernel/sched/memqos/phase_perf.c | 409 +++++++++++++++++++++
kernel/sched/memqos/phase_sim_knn.c | 92 +++++
kernel/sysctl.c | 7 +
mm/mempolicy.c | 10 +-
21 files changed, 1409 insertions(+), 20 deletions(-)
create mode 100644 include/linux/memqos.h
create mode 100644 kernel/sched/memqos/Makefile
create mode 100644 kernel/sched/memqos/memqos.c
create mode 100644 kernel/sched/memqos/phase_feature_sysctl.c
create mode 100644 kernel/sched/memqos/phase_memband.c
create mode 100644 kernel/sched/memqos/phase_perf.c
create mode 100644 kernel/sched/memqos/phase_sim_knn.c
diff --git a/arch/arm64/include/asm/mpam.h b/arch/arm64/include/asm/mpam.h
index 6338eab817e75..269a91d8ca907 100644
--- a/arch/arm64/include/asm/mpam.h
+++ b/arch/arm64/include/asm/mpam.h
@@ -4,6 +4,8 @@
#ifdef CONFIG_MPAM
extern int mpam_rmid_to_partid_pmg(int rmid, int *partid, int *pmg);
+
+void mpam_component_config_mbwu_mon(int partid, int pmg, int monitor, int *result, int nr);
#endif
#endif /* _ASM_ARM64_MPAM_H */
diff --git a/arch/arm64/include/asm/mpam_sched.h b/arch/arm64/include/asm/mpam_sched.h
index 08ed349b6efa1..32d08cf654b31 100644
--- a/arch/arm64/include/asm/mpam_sched.h
+++ b/arch/arm64/include/asm/mpam_sched.h
@@ -40,6 +40,8 @@ static inline void mpam_sched_in(void)
__mpam_sched_in();
}
+void __mpam_sched_in_v2(struct task_struct *tsk);
+
#else
static inline void mpam_sched_in(void) {}
diff --git a/arch/arm64/kernel/mpam/mpam_device.c b/arch/arm64/kernel/mpam/mpam_device.c
index 6455c69f132fd..48de3982a0b9a 100644
--- a/arch/arm64/kernel/mpam/mpam_device.c
+++ b/arch/arm64/kernel/mpam/mpam_device.c
@@ -84,14 +84,14 @@ void mpam_class_list_lock_held(void)
static inline u32 mpam_read_reg(struct mpam_device *dev, u16 reg)
{
WARN_ON_ONCE(reg > SZ_MPAM_DEVICE);
- assert_spin_locked(&dev->lock);
+ //assert_spin_locked(&dev->lock);
/*
* If we touch a device that isn't accessible from this CPU we may get
* an external-abort.
*/
- WARN_ON_ONCE(preemptible());
- WARN_ON_ONCE(!cpumask_test_cpu(smp_processor_id(), &dev->fw_affinity));
+ //WARN_ON_ONCE(preemptible());
+ //WARN_ON_ONCE(!cpumask_test_cpu(smp_processor_id(), &dev->fw_affinity));
return readl_relaxed(dev->mapped_hwpage + reg);
}
@@ -99,14 +99,14 @@ static inline u32 mpam_read_reg(struct mpam_device *dev, u16 reg)
static inline void mpam_write_reg(struct mpam_device *dev, u16 reg, u32 val)
{
WARN_ON_ONCE(reg > SZ_MPAM_DEVICE);
- assert_spin_locked(&dev->lock);
+ //assert_spin_locked(&dev->lock);
/*
* If we touch a device that isn't accessible from this CPU we may get
* an external-abort. If we're lucky, we corrupt another mpam:component.
*/
- WARN_ON_ONCE(preemptible());
- WARN_ON_ONCE(!cpumask_test_cpu(smp_processor_id(), &dev->fw_affinity));
+ //WARN_ON_ONCE(preemptible());
+ //WARN_ON_ONCE(!cpumask_test_cpu(smp_processor_id(), &dev->fw_affinity));
writel_relaxed(val, dev->mapped_hwpage + reg);
}
@@ -1208,6 +1208,7 @@ static u32 mpam_device_read_mbwu_mon(struct mpam_device *dev,
{
u16 mon;
u32 clt, flt, cur_clt, cur_flt;
+ u32 total = 0;
mon = args->mon;
@@ -1249,7 +1250,12 @@ static u32 mpam_device_read_mbwu_mon(struct mpam_device *dev,
wmb();
}
- return mpam_read_reg(dev, MSMON_MBWU);
+ total += mpam_read_reg(dev, MSMON_MBWU);
+ total += mpam_read_reg(dev, MSMON_MBWU);
+ total += mpam_read_reg(dev, MSMON_MBWU);
+ total += mpam_read_reg(dev, MSMON_MBWU);
+ total += mpam_read_reg(dev, MSMON_MBWU);
+ return total / 5;
}
static int mpam_device_frob_mon(struct mpam_device *dev,
@@ -1470,6 +1476,44 @@ static void mpam_component_device_sync(void *__ctx)
cpumask_set_cpu(smp_processor_id(), &ctx->updated_on);
}
+static DEFINE_SPINLOCK(mpam_tmp_lock);
+
+void mpam_component_config_mbwu_mon(int partid, int pmg, int monitor, int *result, int nr)
+{
+ struct mpam_class *class;
+ struct mpam_component *comp;
+ struct mpam_device *dev;
+ struct sync_args args;
+ int i = 0;
+
+ args.pmg = pmg;
+ args.mon = monitor;
+ args.closid.reqpartid = partid;
+ args.match_pmg = 1;
+
+ spin_lock(&mpam_tmp_lock);
+ list_for_each_entry(class, &mpam_classes, classes_list) {
+ if (class->type != MPAM_CLASS_MEMORY)
+ continue;
+
+ list_for_each_entry(comp, &class->components, class_list) {
+ if (i >= nr) {
+ pr_err_once("error, i > result nr");
+ break;
+ }
+ result[i] = 0;
+ list_for_each_entry(dev, &comp->devices, comp_list) {
+ result[i] += mpam_device_read_mbwu_mon(dev, &args);
+ }
+ i++;
+ }
+ break;
+ }
+ spin_unlock(&mpam_tmp_lock);
+
+}
+EXPORT_SYMBOL(mpam_component_config_mbwu_mon);
+
/**
* in some cases/platforms the MSC register access is only possible with
* the associated CPUs. And need to check if those CPUS are online before
diff --git a/arch/arm64/kernel/mpam/mpam_resctrl.c b/arch/arm64/kernel/mpam/mpam_resctrl.c
index 60d3d8706a38b..f4d87964616f2 100644
--- a/arch/arm64/kernel/mpam/mpam_resctrl.c
+++ b/arch/arm64/kernel/mpam/mpam_resctrl.c
@@ -2226,6 +2226,71 @@ int mpam_resctrl_init(void)
return resctrl_group_init();
}
+
+void __mpam_sched_in_v2(struct task_struct *tsk)
+{
+ struct intel_pqr_state *state = this_cpu_ptr(&pqr_state);
+ u64 partid_d, partid_i;
+ u64 rmid = state->default_rmid;
+ u64 closid = state->default_closid;
+ u64 reqpartid = 0;
+ u64 pmg = 0;
+
+ /*
+ * If this task has a closid/rmid assigned, use it.
+ * Else use the closid/rmid assigned to this cpu.
+ */
+ if (static_branch_likely(&resctrl_alloc_enable_key)) {
+ if (tsk->closid)
+ closid = tsk->closid;
+ }
+
+ if (static_branch_likely(&resctrl_mon_enable_key)) {
+ if (tsk->rmid)
+ rmid = tsk->rmid;
+ }
+
+ if (closid != state->cur_closid || rmid != state->cur_rmid) {
+ u64 reg;
+
+ resctrl_navie_rmid_partid_pmg(rmid, (int *)&reqpartid, (int *)&pmg);
+
+ if (resctrl_cdp_enabled) {
+ resctrl_cdp_mpamid_map_val(reqpartid, CDP_DATA, partid_d);
+ resctrl_cdp_mpamid_map_val(reqpartid, CDP_CODE, partid_i);
+
+ /* set in EL0 */
+ reg = mpam_read_sysreg_s(SYS_MPAM0_EL1, "SYS_MPAM0_EL1");
+ reg = PARTID_D_SET(reg, partid_d);
+ reg = PARTID_I_SET(reg, partid_i);
+ reg = PMG_SET(reg, pmg);
+ mpam_write_sysreg_s(reg, SYS_MPAM0_EL1, "SYS_MPAM0_EL1");
+
+ /* set in EL1 */
+ reg = mpam_read_sysreg_s(SYS_MPAM1_EL1, "SYS_MPAM1_EL1");
+ reg = PARTID_D_SET(reg, partid_d);
+ reg = PARTID_I_SET(reg, partid_i);
+ reg = PMG_SET(reg, pmg);
+ mpam_write_sysreg_s(reg, SYS_MPAM1_EL1, "SYS_MPAM1_EL1");
+ } else {
+ /* set in EL0 */
+ reg = mpam_read_sysreg_s(SYS_MPAM0_EL1, "SYS_MPAM0_EL1");
+ reg = PARTID_SET(reg, reqpartid);
+ reg = PMG_SET(reg, pmg);
+ mpam_write_sysreg_s(reg, SYS_MPAM0_EL1, "SYS_MPAM0_EL1");
+
+ /* set in EL1 */
+ reg = mpam_read_sysreg_s(SYS_MPAM1_EL1, "SYS_MPAM1_EL1");
+ reg = PARTID_SET(reg, reqpartid);
+ reg = PMG_SET(reg, pmg);
+ mpam_write_sysreg_s(reg, SYS_MPAM1_EL1, "SYS_MPAM1_EL1");
+ }
+
+ state->cur_rmid = rmid;
+ state->cur_closid = closid;
+ }
+}
+
/*
* __intel_rdt_sched_in() - Writes the task's CLOSid/RMID to IA32_PQR_MSR
*
diff --git a/include/linux/memqos.h b/include/linux/memqos.h
new file mode 100644
index 0000000000000..814e9935590d3
--- /dev/null
+++ b/include/linux/memqos.h
@@ -0,0 +1,142 @@
+#ifndef _MEMQOS_H
+#define _MEMQOS_H
+
+#include <linux/vmstat.h>
+#include <linux/rbtree.h>
+//#include <linux/sched.h>
+
+struct task_struct;
+
+struct memqos_domain {
+ int dom_id;
+ int total_memband_div_10;
+ int total_out_memband_div_10;
+
+ //record 10 timers
+ int memband_ringpos;
+ int memband_div_10_history[4][10];
+};
+
+struct memqos_mpam_profile {
+ int partid;
+ int pmg;
+ int monitor;
+
+ struct task_struct *tsk;
+ int used;
+};
+
+struct memqos_wait_profile {
+ struct memqos_mpam_profile *profile;
+ struct list_head wait_list;
+};
+
+struct memqos_class {
+ struct list_head turbo_list;
+ struct list_head tasks_list;
+};
+
+#include <linux/topology.h>
+//embed in task_struct
+
+struct task_memqos {
+ int ipc_ringpos;
+ int ipcx10;
+ int ipcx10_total[4];
+ int ipcx10_history[10];
+
+ int memband_div_10;
+ int memband_ringpos;
+ int memband_div_10_total[4];
+ int memband_div_10_history[4][10];
+
+ u32 sample_times;
+ int account_ready;
+ int numa_score[4];
+ int turbo;
+
+ struct memqos_wait_profile mpam_profile;
+
+ struct list_head turbo_list;
+ struct list_head task_list;
+
+ struct cpumask *advise_mem_node_mask;
+ int preferred_nid;
+
+ int class_id;
+
+ int corrupt;
+};
+
+#define PHASE_PEVENT_NUM 10
+
+struct phase_event_pcount {
+ u64 data[PHASE_PEVENT_NUM];
+};
+
+struct phase_event_count {
+ struct phase_event_pcount pcount;
+};
+
+void phase_update_mpam_label(struct task_struct *tsk);
+
+void phase_release_mpam_label(struct task_struct *tsk);
+
+static inline void memqos_update_mpam_label(struct task_struct *tsk)
+{
+ phase_update_mpam_label(tsk);
+}
+
+static inline void memqos_release_mpam_label(struct task_struct *tsk)
+{
+ phase_release_mpam_label(tsk);
+}
+
+void phase_destroy_waitqueue(struct task_struct *tsk);
+
+void phase_get_memband(struct memqos_mpam_profile *pm, int *result, int nr);
+
+DECLARE_STATIC_KEY_FALSE(sched_phase);
+DECLARE_STATIC_KEY_FALSE(sched_phase_printk);
+
+int phase_perf_create(void);
+
+void phase_perf_release(void);
+
+void memqos_account_task(struct task_struct *p, int cpu);
+
+void memqos_drop_class(struct task_struct *p);
+
+void phase_account_task(struct task_struct *p, int cpu);
+
+static inline void memqos_task_collect_data(struct task_struct *p, int cpu)
+{
+ phase_account_task(p, cpu);
+}
+
+static inline void memqos_task_account(struct task_struct *p, int cpu)
+{
+ memqos_account_task(p, cpu);
+}
+
+static inline void memqos_task_exit(struct task_struct *p)
+{
+
+ memqos_drop_class(p);
+ phase_destroy_waitqueue(p);
+}
+
+void memqos_select_nicest_cpus(struct task_struct *p);
+
+void memqos_exclude_low_level_task_single(struct task_struct *p);
+
+int knn_get_tag(int ipcx10, int memband_div_10);
+
+void memqos_init_class(struct task_struct *p);
+
+void phase_trace_printk(struct task_struct *p);
+static inline void memqos_trace_printk(struct task_struct *p)
+{
+ phase_trace_printk(p);
+}
+#endif
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 928186f161000..5f710dc5bc03b 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -29,6 +29,7 @@
#include <linux/task_io_accounting.h>
#include <linux/rseq.h>
#include <linux/thread_bits.h>
+#include <linux/memqos.h>
/* task_struct member predeclarations (sorted alphabetically): */
struct audit_context;
@@ -1268,7 +1269,7 @@ struct task_struct {
#if !defined(__GENKSYMS__)
#if defined(CONFIG_QOS_SCHED_DYNAMIC_AFFINITY)
cpumask_t *prefer_cpus;
- const cpumask_t *select_cpus;
+ cpumask_t *select_cpus;
#else
KABI_RESERVE(6)
KABI_RESERVE(7)
@@ -1279,6 +1280,10 @@ struct task_struct {
#endif
KABI_RESERVE(8)
+#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
+ struct task_memqos sched_memqos;
+#endif
+
/* CPU-specific state of this task: */
struct thread_struct thread;
@@ -1998,6 +2003,13 @@ int set_prefer_cpus_ptr(struct task_struct *p,
const struct cpumask *new_mask);
int sched_prefer_cpus_fork(struct task_struct *p, struct task_struct *orig);
void sched_prefer_cpus_free(struct task_struct *p);
+static inline bool prefer_cpus_valid(struct task_struct *p)
+{
+ return p->prefer_cpus &&
+ !cpumask_empty(p->prefer_cpus) &&
+ !cpumask_equal(p->prefer_cpus, &p->cpus_allowed) &&
+ cpumask_subset(p->prefer_cpus, &p->cpus_allowed);
+}
#endif
#endif
diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
index b769ecfcc3bd4..73bce39107cb3 100644
--- a/include/linux/sysctl.h
+++ b/include/linux/sysctl.h
@@ -230,6 +230,8 @@ static inline void setup_sysctl_set(struct ctl_table_set *p,
#endif /* CONFIG_SYSCTL */
+extern struct ctl_table phase_table[];
+
int sysctl_max_threads(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp, loff_t *ppos);
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 55bfbc4cdb16c..d94a9065a5605 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -106,6 +106,7 @@ struct cpuset {
nodemask_t mems_allowed;
#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
cpumask_var_t prefer_cpus;
+ int mem_turbo;
#endif
/* effective CPUs and Memory Nodes allow to tasks */
diff --git a/kernel/exit.c b/kernel/exit.c
index 2a32d32bdc03d..b731c19618176 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -699,6 +699,8 @@ static void check_stack_usage(void)
static inline void check_stack_usage(void) {}
#endif
+#include <linux/memqos.h>
+
void __noreturn do_exit(long code)
{
struct task_struct *tsk = current;
@@ -806,6 +808,7 @@ void __noreturn do_exit(long code)
* because of cgroup mode, must be called before cgroup_exit()
*/
perf_event_exit_task(tsk);
+ memqos_task_exit(tsk);
sched_autogroup_exit_task(tsk);
cgroup_exit(tsk);
diff --git a/kernel/fork.c b/kernel/fork.c
index b5453a26655e2..0a762b92dc814 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -841,6 +841,8 @@ void set_task_stack_end_magic(struct task_struct *tsk)
*stackend = STACK_END_MAGIC; /* for overflow detection */
}
+
+#include <linux/memqos.h>
static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
{
struct task_struct *tsk;
@@ -923,6 +925,8 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
kcov_task_init(tsk);
+ memqos_init_class(tsk);
+
#ifdef CONFIG_FAULT_INJECTION
tsk->fail_nth = 0;
#endif
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index 7fe183404c383..471380d6686e3 100644
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -29,3 +29,4 @@ obj-$(CONFIG_CPU_FREQ) += cpufreq.o
obj-$(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) += cpufreq_schedutil.o
obj-$(CONFIG_MEMBARRIER) += membarrier.o
obj-$(CONFIG_CPU_ISOLATION) += isolation.o
+obj-$(CONFIG_QOS_SCHED_DYNAMIC_AFFINITY) += memqos/
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 970616070da86..1171025aaa440 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2787,6 +2787,8 @@ asmlinkage __visible void schedule_tail(struct task_struct *prev)
calculate_sigpending();
}
+#include <linux/memqos.h>
+
/*
* context_switch - switch to the new MM and the new thread's register state.
*/
@@ -2794,6 +2796,8 @@ static __always_inline struct rq *
context_switch(struct rq *rq, struct task_struct *prev,
struct task_struct *next, struct rq_flags *rf)
{
+ struct rq *ret;
+
prepare_task_switch(rq, prev, next);
/*
@@ -2837,6 +2841,18 @@ context_switch(struct rq *rq, struct task_struct *prev,
}
}
+ //account and release
+ memqos_task_account(prev, smp_processor_id());
+
+ if (prefer_cpus_valid(prev))
+ memqos_trace_printk(prev);
+
+ memqos_release_mpam_label(prev);
+
+ //label new task's mpamid
+ if (prefer_cpus_valid(next))
+ memqos_update_mpam_label(next);
+
rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP);
prepare_lock_switch(rq, next, rf);
@@ -2845,7 +2861,9 @@ context_switch(struct rq *rq, struct task_struct *prev,
switch_to(prev, next, prev);
barrier();
- return finish_task_switch(prev);
+ ret = finish_task_switch(prev);
+
+ return ret;
}
/*
@@ -3058,8 +3076,12 @@ unsigned long long task_sched_runtime(struct task_struct *p)
void scheduler_tick(void)
{
int cpu = smp_processor_id();
+ //memqos clooect next cpu's memband and perf
+ int cpu_memqos = (cpu + 1) % nr_cpu_ids;
struct rq *rq = cpu_rq(cpu);
+ struct rq *rq_next = cpu_rq(cpu_memqos);
struct task_struct *curr = rq->curr;
+ struct task_struct *curr_memqos = rq_next->curr;
struct rq_flags rf;
sched_clock_tick();
@@ -3075,6 +3097,10 @@ void scheduler_tick(void)
perf_event_task_tick();
+ //only monitor task enabled dynamic affinity
+ if (curr_memqos && prefer_cpus_valid(curr_memqos))
+ memqos_task_collect_data(curr_memqos, cpu_memqos);
+
#ifdef CONFIG_SMP
rq->idle_balance = idle_cpu(cpu);
trigger_load_balance(rq);
@@ -3524,6 +3550,7 @@ static void __sched notrace __schedule(bool preempt)
/* Also unlocks the rq: */
rq = context_switch(rq, prev, next, &rf);
} else {
+ memqos_task_account(prev, smp_processor_id());
rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP);
rq_unlock_irq(rq, &rf);
}
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index af55a26d11fcb..12e9675495d2c 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -6675,6 +6675,7 @@ static int wake_cap(struct task_struct *p, int cpu, int prev_cpu)
}
#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
+#include <linux/memqos.h>
/*
* Low utilization threshold for CPU
*
@@ -6749,14 +6750,6 @@ static inline int cpu_vutil_of(int cpu)
return cputime->vutil;
}
-static inline bool prefer_cpus_valid(struct task_struct *p)
-{
- return p->prefer_cpus &&
- !cpumask_empty(p->prefer_cpus) &&
- !cpumask_equal(p->prefer_cpus, &p->cpus_allowed) &&
- cpumask_subset(p->prefer_cpus, &p->cpus_allowed);
-}
-
/*
* set_task_select_cpus: select the cpu range for task
* @p: the task whose available cpu range will to set
@@ -6828,8 +6821,13 @@ static void set_task_select_cpus(struct task_struct *p, int *idlest_cpu,
if (util_avg_sum < sysctl_sched_util_low_pct *
cpumask_weight(p->prefer_cpus)) {
p->select_cpus = p->prefer_cpus;
+ memqos_select_nicest_cpus(p);
if (sd_flag & SD_BALANCE_WAKE)
schedstat_inc(p->se.dyn_affi_stats->nr_wakeups_preferred_cpus);
+ } else {
+ //select trubo task
+ //select low class task
+ memqos_exclude_low_level_task_single(p);
}
}
#endif
diff --git a/kernel/sched/memqos/Makefile b/kernel/sched/memqos/Makefile
new file mode 100644
index 0000000000000..ed8f42649a8a7
--- /dev/null
+++ b/kernel/sched/memqos/Makefile
@@ -0,0 +1,6 @@
+# SPDX-License-Identifier: GPL-2.0
+# These files are disabled because they produce non-interesting flaky coverage
+# that is not a function of syscall inputs. E.g. involuntary context switches.
+KCOV_INSTRUMENT := n
+
+obj-y := memqos.o phase_feature_sysctl.o phase_memband.o phase_perf.o phase_sim_knn.o
diff --git a/kernel/sched/memqos/memqos.c b/kernel/sched/memqos/memqos.c
new file mode 100644
index 0000000000000..ddf8785439aa6
--- /dev/null
+++ b/kernel/sched/memqos/memqos.c
@@ -0,0 +1,297 @@
+#include <linux/memqos.h>
+#include <linux/cpumask.h>
+#include <linux/sched.h>
+
+static void memqos_set_task_classid(struct task_struct *p)
+{
+ int class_id;
+ int memband_div_10 = p->sched_memqos.memband_div_10;
+ int ipcx10 = p->sched_memqos.ipcx10;
+
+ class_id = knn_get_tag((u64)ipcx10, (u64)memband_div_10);
+ p->sched_memqos.class_id = class_id;
+}
+
+//static memqos_domain mq_domains[] = {
+// {.dom_id = 0, .total_memband = 0, .total_out_memband = 0,},
+// {.dom_id = 1, .total_memband = 0, .total_out_memband = 0,},
+// {.dom_id = 2, .total_memband = 0, .total_out_memband = 0,},
+// {.dom_id = 3, .total_memband = 0, .total_out_memband = 0,},
+//};
+
+static DEFINE_PER_CPU(struct memqos_class, memqos_classes[8]);
+//static DEFINE_PER_CPU(spinlock_t, memqos_class_lock);
+static DEFINE_SPINLOCK(memqos_class_lock);
+
+static int memqos_class_online(unsigned int cpu)
+{
+ int class_id = 0;
+ struct memqos_class *class;
+
+ for (class_id = 0; class_id < 8; class_id++) {
+ class = &per_cpu(memqos_classes, cpu)[class_id];
+ INIT_LIST_HEAD(&class->tasks_list);
+ INIT_LIST_HEAD(&class->turbo_list);
+ }
+ return 0;
+}
+
+static int memqos_class_offline(unsigned int cpu)
+{
+ return 0;
+}
+
+#include <linux/cpu.h>
+#include <linux/cacheinfo.h>
+
+static void memqos_init(void)
+{
+ int cpuhp_state = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN,
+ "memqos:online", memqos_class_online,
+ memqos_class_offline);
+ if (cpuhp_state <= 0) {
+ pr_err("Failed to register 'dyn' cpuhp callbacks");
+ return;
+ }
+}
+late_initcall(memqos_init);
+
+static void memqos_insert_to_class(struct task_struct *p, int cpu)
+{
+ unsigned long flag;
+ int class_id = p->sched_memqos.class_id;
+ struct memqos_class *class;
+ struct task_memqos *memqos;
+
+ if (class_id >= 8)
+ return;
+
+ memqos = &p->sched_memqos;
+
+ class = &per_cpu(memqos_classes, cpu)[class_id];
+
+ spin_lock_irqsave(&memqos_class_lock, flag);
+ if (p->sched_memqos.corrupt) {
+ spin_unlock_irqrestore(&memqos_class_lock, flag);
+ return;
+ }
+
+ //pr_info("count:%d %d add (%llx) %llx %llx to list %llx!!!!!!!!!!!!!\n", count, p->pid, &p->sched_memqos.task_list, p->sched_memqos.task_list.next, p->sched_memqos.task_list.prev, &class->tasks_list);
+ list_move_tail(&p->sched_memqos.task_list, &class->tasks_list);
+ if (memqos->turbo)
+ list_move_tail(&p->sched_memqos.turbo_list, &class->turbo_list);
+ spin_unlock_irqrestore(&memqos_class_lock, flag);
+}
+
+static void memqos_drop_class_without_lock(struct task_struct *p)
+{
+ //pr_info("%d drop (%llx) %llx %llx to list %llx!!!!!!!!!!!!!\n", p->pid, &p->sched_memqos.task_list, p->sched_memqos.task_list.next, p->sched_memqos.task_list.prev);
+ list_del_init(&p->sched_memqos.task_list);
+ list_del_init(&p->sched_memqos.turbo_list);
+}
+
+static void memqos_score(struct task_struct *p)
+{
+ int total_n1 = p->sched_memqos.memband_div_10_total[0];
+ int total_n2 = p->sched_memqos.memband_div_10_total[1];
+ int total_n3 = p->sched_memqos.memband_div_10_total[2];
+ int total_n4 = p->sched_memqos.memband_div_10_total[3];
+
+ p->sched_memqos.numa_score[0] = (total_n1 - (total_n2 + total_n3 + total_n4)) * 10 / total_n1;
+ p->sched_memqos.numa_score[1] = (total_n2 - (total_n1 + total_n3 + total_n4)) * 10 / total_n2;
+ p->sched_memqos.numa_score[2] = (total_n3 - (total_n1 + total_n2 + total_n4)) * 10 / total_n3;
+ p->sched_memqos.numa_score[3] = (total_n4 - (total_n1 + total_n2 + total_n3)) * 10 / total_n4;
+
+ //over x% percent
+ if (p->sched_memqos.numa_score[0] > 0)
+ p->sched_memqos.turbo = 1;
+ else if (p->sched_memqos.numa_score[1] > 0)
+ p->sched_memqos.turbo = 2;
+ else if (p->sched_memqos.numa_score[2] > 0)
+ p->sched_memqos.turbo = 3;
+ else if (p->sched_memqos.numa_score[3] > 0)
+ p->sched_memqos.turbo = 4;
+ else
+ p->sched_memqos.turbo = 0;
+}
+
+void memqos_account_task(struct task_struct *p, int cpu)
+{
+ if (!p->sched_memqos.account_ready ||
+ p->sched_memqos.corrupt)
+ return;
+ memqos_set_task_classid(p);
+ memqos_insert_to_class(p, cpu);
+ memqos_score(p);
+ p->sched_memqos.account_ready = 0;
+}
+
+void memqos_init_class(struct task_struct *p)
+{
+ memset(&p->sched_memqos, 0, sizeof(struct task_memqos));
+ spin_lock(&memqos_class_lock);
+ INIT_LIST_HEAD(&p->sched_memqos.task_list);
+ INIT_LIST_HEAD(&p->sched_memqos.turbo_list);
+ INIT_LIST_HEAD(&p->sched_memqos.mpam_profile.wait_list);
+ spin_unlock(&memqos_class_lock);
+
+ p->closid = 0;
+ p->rmid = 0;
+}
+
+//destroy ?
+void memqos_drop_class(struct task_struct *p)
+{
+ spin_lock(&memqos_class_lock);
+ memqos_drop_class_without_lock(p);
+ p->sched_memqos.corrupt = 1;
+ spin_unlock(&memqos_class_lock);
+}
+
+void memqos_select_nicest_cpus(struct task_struct *p)
+{
+ int i = 0;
+ int max_score = -10000;
+ int select_node = 0;
+ struct task_memqos *memqos = &p->sched_memqos;
+
+ if (!memqos->turbo) {
+ for (i = 0; i < 4; i++) {
+ if (!cpumask_intersects(cpumask_of_node(i), p->select_cpus))
+ continue;
+
+ if (memqos->numa_score[i] > max_score) {
+ select_node = i;
+ max_score = memqos->numa_score[i];
+ }
+ }
+
+ cpumask_and(p->select_cpus, p->select_cpus, cpumask_of_node(select_node));
+ //p->sched_memqos.advise_mem_node_mask = cpumask_of_node(select_node);
+ p->sched_memqos.preferred_nid = memqos->turbo;
+ return;
+ }
+
+ select_node = memqos->turbo - 1;
+ if (cpumask_intersects(cpumask_of_node(select_node), p->select_cpus)) {
+ cpumask_and(p->select_cpus, p->select_cpus, cpumask_of_node(select_node));
+ //p->sched_memqos.advise_mem_node_mask = cpumask_of_node(select_node);
+ p->sched_memqos.preferred_nid = memqos->turbo;
+ }
+
+ return;
+
+ //if turbo another cpus, wait...
+}
+
+void memqos_exclude_low_level_task_single(struct task_struct *p)
+{
+ int i, j, cpu;
+ int find = 0;
+ int select_node = 0;
+ const struct cpumask *cpumask;
+ struct cpumask *cpumask_med;
+ struct memqos_class *class;
+ struct task_memqos *memqos = &p->sched_memqos;;
+ struct task_struct *tsk = NULL;
+ int max_score = -100000;
+
+ if (memqos->turbo) {
+ select_node = memqos->turbo - 1;
+ cpumask = cpumask_of_node(select_node);
+ if (!cpumask_intersects(cpumask, p->prefer_cpus) &&
+ (cpumask_intersects(&p->cpus_allowed, cpumask))) {
+ cpumask_and(p->select_cpus, &p->cpus_allowed, cpumask);
+ memqos_drop_class(p);
+ //p->sched_memqos.advise_mem_node_mask = cpumask_of_node(select_node);
+ p->sched_memqos.preferred_nid = memqos->turbo;
+ return;
+ } else if (cpumask_intersects(p->prefer_cpus, cpumask)) {
+ cpumask_and(p->select_cpus, p->prefer_cpus, cpumask);
+ //p->sched_memqos.advise_mem_node_mask = cpumask_of_node(select_node);
+ p->sched_memqos.preferred_nid = memqos->turbo;
+ }
+ }
+
+ //select turbo one
+ for (cpu = 0; cpu < nr_cpu_ids; i++) {
+ if (!cpumask_test_cpu(cpu, p->prefer_cpus))
+ continue;
+
+ spin_lock(&memqos_class_lock);
+ for (i = 7; i >= 0; i--) {
+ class = &per_cpu(memqos_classes, cpu)[i];
+ list_for_each_entry(memqos, &class->turbo_list, turbo_list) {
+ if (!memqos->turbo)
+ continue;
+ select_node = memqos->turbo - 1;
+ cpumask = cpumask_of_node(select_node);
+ if (!cpumask_intersects(cpumask, p->prefer_cpus)) {
+ tsk = container_of(memqos, struct task_struct, sched_memqos);
+ if (!cpumask_intersects(cpumask, &tsk->cpus_allowed))
+ continue;
+ cpumask_and(tsk->select_cpus, &tsk->cpus_allowed, cpumask);
+ //mem prefered
+ //tsk->sched_memqos.advise_mem_node_mask = cpumask_of_node(select_node);
+ tsk->sched_memqos.preferred_nid = memqos->turbo;
+ find = 1;
+ break;
+ }
+ }
+ if (find) {
+ memqos_drop_class_without_lock(tsk);
+ spin_unlock(&memqos_class_lock);
+ return;
+ }
+ }
+ spin_unlock(&memqos_class_lock);
+ }
+
+ find = 0;
+
+ //if not, select lower class's tsk
+ for (cpu = 0; cpu < nr_cpu_ids; i++) {
+ if (!cpumask_test_cpu(cpu, p->prefer_cpus))
+ continue;
+
+ spin_lock(&memqos_class_lock);
+ //only find below class tsk
+ for (i = 0; i < memqos->class_id; i++) {
+ class = &per_cpu(memqos_classes, cpu)[i];
+ list_for_each_entry(memqos, &class->tasks_list, task_list) {
+ if (memqos->turbo)
+ continue;
+
+ tsk = container_of(memqos, struct task_struct, sched_memqos);
+ for (j = 0; j < 4; j++) {
+ if (!cpumask_intersects(cpumask_of_node(i), &tsk->cpus_allowed))
+ continue;
+ if (memqos->numa_score[j] > max_score) {
+ select_node = j;
+ max_score = memqos->numa_score[j];
+ }
+ find = 1;
+ }
+ if (!find)
+ continue;
+
+ cpumask_and(cpumask_med, cpumask_of_node(select_node), &tsk->cpus_allowed);
+ cpumask_andnot(cpumask_med, cpumask_med, p->prefer_cpus);
+ if (cpumask_empty(cpumask_med))
+ continue;
+ cpumask_copy(tsk->select_cpus, cpumask_med);
+ //mem prefered
+ //tsk->sched_memqos.advise_mem_node_mask = cpumask_of_node(select_node);
+ tsk->sched_memqos.preferred_nid = memqos->turbo;
+ memqos_drop_class_without_lock(tsk);
+ spin_unlock(&memqos_class_lock);
+ return;
+ }
+ }
+ spin_unlock(&memqos_class_lock);
+ }
+
+ //do not care, this task may out
+ return;
+}
+
diff --git a/kernel/sched/memqos/phase_feature_sysctl.c b/kernel/sched/memqos/phase_feature_sysctl.c
new file mode 100644
index 0000000000000..443ae03275605
--- /dev/null
+++ b/kernel/sched/memqos/phase_feature_sysctl.c
@@ -0,0 +1,126 @@
+#include <linux/sched.h>
+#include <linux/sysctl.h>
+#include <linux/capability.h>
+#include <linux/cpumask.h>
+#include <linux/topology.h>
+#include <linux/sched/task.h>
+
+#include <linux/memqos.h>
+
+#ifdef CONFIG_PROC_SYSCTL
+
+DEFINE_STATIC_KEY_FALSE(sched_phase);
+DEFINE_STATIC_KEY_FALSE(sched_phase_printk);
+
+static int set_phase_state(bool enabled)
+{
+ int err;
+ int state = static_branch_likely(&sched_phase);
+
+ if (enabled == state) {
+ pr_warn("phase has already %s\n", state ? "enabled" : "disabled");
+ return 0;
+ }
+
+ if (enabled) {
+ err = phase_perf_create();
+ if (err) {
+ pr_err("phase enable failed\n");
+ return err;
+ }
+ static_branch_enable(&sched_phase);
+ pr_info("phase enabled\n");
+ } else {
+ static_branch_disable(&sched_phase);
+ phase_perf_release();
+ pr_info("phase disabled\n");
+ }
+
+ return 0;
+}
+
+/*
+ * the other procfs files of phase cannot be modified if sched_phase is already enabled
+ */
+static int phase_proc_state(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ struct ctl_table t;
+ int err;
+ int state = static_branch_likely(&sched_phase);
+
+ if (write && !capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ t = *table;
+ t.data = &state;
+ err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos);
+ if (err < 0)
+ return err;
+ if (write)
+ err = set_phase_state(state);
+
+ return err;
+}
+
+static int set_phase_state_printk(bool enabled)
+{
+ if (enabled) {
+ static_branch_enable(&sched_phase_printk);
+ } else {
+ static_branch_disable(&sched_phase_printk);
+ }
+
+ return 0;
+}
+
+/*
+ * the other procfs files of phase cannot be modified if sched_phase is already enabled
+ */
+static int phase_proc_state_printk(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ struct ctl_table t;
+ int err;
+ int state = static_branch_likely(&sched_phase);
+
+ if (write && !capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ t = *table;
+ t.data = &state;
+ err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos);
+ if (err < 0)
+ return err;
+ if (write)
+ err = set_phase_state_printk(state);
+
+ return err;
+}
+
+
+static int __maybe_unused zero;
+static int __maybe_unused one = 1;
+
+struct ctl_table phase_table[] = {
+ {
+ .procname = "enabled",
+ .data = NULL,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = phase_proc_state,
+ .extra1 = &zero,
+ .extra2 = &one,
+ },
+ {
+ .procname = "trace_enabled",
+ .data = NULL,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = phase_proc_state_printk,
+ .extra1 = &zero,
+ .extra2 = &one,
+ },
+ { }
+};
+#endif /* CONFIG_PROC_SYSCTL */
diff --git a/kernel/sched/memqos/phase_memband.c b/kernel/sched/memqos/phase_memband.c
new file mode 100644
index 0000000000000..d83909c8eca45
--- /dev/null
+++ b/kernel/sched/memqos/phase_memband.c
@@ -0,0 +1,145 @@
+#include <linux/types.h>
+#include <linux/cpu.h>
+#include <linux/memqos.h>
+
+#include <asm/cpu.h>
+#include <asm/cputype.h>
+#include <asm/cpufeature.h>
+#include <asm/mpam_sched.h>
+
+static const int nr_partid = 15;
+static const int nr_monitor = 4;
+
+static LIST_HEAD(phase_mpam_waitqueue);
+
+//mpam_profile_res[0] not used
+struct memqos_mpam_profile mpam_profile_res[16] = {
+ { .partid = 0, .monitor = 0, .used = 1},
+ { .partid = 1, .monitor = 0,},
+ { .partid = 2, .monitor = 1,},
+ { .partid = 3, .monitor = 2,},
+ { .partid = 4, .monitor = 3,},
+ { .partid = 5, .monitor = 0,},
+ { .partid = 6, .monitor = 1,},
+ { .partid = 7, .monitor = 2,},
+ { .partid = 8, .monitor = 3,},
+ { .partid = 9, .monitor = 0,},
+ { .partid = 10, .monitor = 1,},
+ { .partid = 11, .monitor = 2,},
+ { .partid = 12, .monitor = 3,},
+ { .partid = 13, .monitor = 0,},
+ { .partid = 14, .monitor = 1,},
+ { .partid = 15, .monitor = 2,},
+};
+
+static DEFINE_SPINLOCK(phase_partid_lock);
+
+void phase_update_mpam_label(struct task_struct *tsk)
+{
+ int i = 0;
+ //unsigned long flag;
+
+ WARN_ON_ONCE(tsk->closid);
+
+ if (tsk->sched_memqos.mpam_profile.profile != &mpam_profile_res[0] &&
+ tsk->sched_memqos.mpam_profile.profile != NULL) {
+ tsk->closid = tsk->sched_memqos.mpam_profile.profile->partid;
+ tsk->rmid = 0;
+ mpam_profile_res[tsk->closid].tsk = tsk;
+ __mpam_sched_in_v2(tsk);
+ return;
+ }
+
+ spin_lock(&phase_partid_lock);
+ //is in profile queue, wait...
+ if (tsk->sched_memqos.mpam_profile.profile == &mpam_profile_res[0]) {
+ spin_unlock(&phase_partid_lock);
+ return;
+ }
+
+ for (i = 1; i < 16; i++) {
+ if (mpam_profile_res[i].used) {
+ continue;
+ }
+
+ tsk->sched_memqos.mpam_profile.profile = NULL;
+ break;
+ }
+
+ if (i == 16) {
+ list_move_tail(&tsk->sched_memqos.mpam_profile.wait_list, &phase_mpam_waitqueue);
+ tsk->sched_memqos.mpam_profile.profile = &mpam_profile_res[0];
+ spin_unlock(&phase_partid_lock);
+ //wait...
+ return;
+ }
+
+ mpam_profile_res[i].used = 1;
+ spin_unlock(&phase_partid_lock);
+
+ tsk->closid = mpam_profile_res[i].partid;
+ mpam_profile_res[i].tsk = tsk;
+ tsk->sched_memqos.mpam_profile.profile = &mpam_profile_res[i];
+ tsk->rmid = 0;
+ __mpam_sched_in_v2(tsk);
+}
+
+static void phase_release_mpam_label_without_lock(struct task_struct *tsk)
+{
+ int closid;
+ struct memqos_wait_profile *next;
+
+ //assert locked
+
+ if (tsk->closid == 0)
+ return;
+
+ closid = tsk->closid;
+ tsk->closid = 0;
+ tsk->sched_memqos.mpam_profile.profile = NULL;
+ mpam_profile_res[closid].used = 0;
+ mpam_profile_res[closid].tsk = NULL;
+
+ next = list_first_entry_or_null(&phase_mpam_waitqueue, struct memqos_wait_profile, wait_list);
+ if (next) {
+ list_del_init(&next->wait_list);
+ next->profile = &mpam_profile_res[closid];
+ mpam_profile_res[closid].used = 1;
+ }
+
+ return;
+}
+
+//task shutdown
+void phase_destroy_waitqueue(struct task_struct *tsk)
+{
+ spin_lock(&phase_partid_lock);
+
+ //if (tsk->sched_memqos.mpam_profile.profile == &mpam_profile_res[0]) {
+ list_del_init(&tsk->sched_memqos.mpam_profile.wait_list);
+ //} else {
+ phase_release_mpam_label_without_lock(tsk);
+ //}
+ spin_unlock(&phase_partid_lock);
+}
+
+void phase_release_mpam_label(struct task_struct *tsk)
+{
+ spin_lock(&phase_partid_lock);
+ phase_release_mpam_label_without_lock(tsk);
+ spin_unlock(&phase_partid_lock);
+}
+
+#include <asm/mpam.h>
+void phase_get_memband(struct memqos_mpam_profile *pm, int *result, int nr)
+{
+ if (pm == &mpam_profile_res[0] || pm == NULL) {
+ result[0] = 0;
+ result[1] = 0;
+ result[2] = 0;
+ result[3] = 0;
+ return;
+ }
+
+ mpam_component_config_mbwu_mon(pm->partid, pm->pmg, pm->monitor, result, nr);
+}
diff --git a/kernel/sched/memqos/phase_perf.c b/kernel/sched/memqos/phase_perf.c
new file mode 100644
index 0000000000000..9b450a20e808f
--- /dev/null
+++ b/kernel/sched/memqos/phase_perf.c
@@ -0,0 +1,409 @@
+#include <linux/kernel.h>
+#include <linux/perf_event.h>
+#include <linux/percpu-defs.h>
+#include <linux/slab.h>
+#include <linux/stop_machine.h>
+#include <linux/memqos.h>
+#include <linux/sched.h>
+
+#define PHASE_FEVENT_NUM 3
+
+int *phase_perf_pevents = NULL;
+
+static DEFINE_PER_CPU(__typeof__(struct perf_event *)[PHASE_PEVENT_NUM], cpu_phase_perf_events);
+
+/******************************************
+ * Helpers for phase perf event
+ *****************************************/
+static inline struct perf_event *perf_event_of_cpu(int cpu, int index)
+{
+ return per_cpu(cpu_phase_perf_events, cpu)[index];
+}
+
+static inline struct perf_event **perf_events_of_cpu(int cpu)
+{
+ return per_cpu(cpu_phase_perf_events, cpu);
+}
+
+static inline u64 perf_event_local_pmu_read(struct perf_event *event)
+{
+ if (event->state == PERF_EVENT_STATE_ACTIVE)
+ event->pmu->read(event);
+ return local64_read(&event->count);
+}
+
+/******************************************
+ * Helpers for cpu counters
+ *****************************************/
+static inline u64 read_cpu_counter(int cpu, int index)
+{
+ struct perf_event *event = perf_event_of_cpu(cpu, index);
+
+ if (!event || !event->pmu)
+ return 0;
+
+ return perf_event_local_pmu_read(event);
+}
+
+static struct perf_event_attr *alloc_attr(int event_id)
+{
+ struct perf_event_attr *attr;
+
+ attr = kzalloc(sizeof(struct perf_event_attr), GFP_KERNEL);
+ if (!attr)
+ return ERR_PTR(-ENOMEM);
+
+ attr->type = PERF_TYPE_RAW;
+ attr->config = event_id;
+ attr->size = sizeof(struct perf_event_attr);
+ attr->pinned = 1;
+ attr->disabled = 1;
+ //attr->exclude_hv;
+ //attr->exclude_idle;
+ //attr->exclude_kernel;
+
+ return attr;
+}
+
+static int create_cpu_counter(int cpu, int event_id, int index)
+{
+ struct perf_event_attr *attr = NULL;
+ struct perf_event **events = perf_events_of_cpu(cpu);
+ struct perf_event *event = NULL;
+
+ attr = alloc_attr(event_id);
+ if (IS_ERR(attr))
+ return PTR_ERR(attr);
+
+ event = perf_event_create_kernel_counter(attr, cpu, NULL, NULL, NULL);
+ if (IS_ERR(event)) {
+ pr_err("unable to create perf event (cpu:%i-type:%d-pinned:%d-config:0x%llx) : %ld",
+ cpu, attr->type, attr->pinned, attr->config, PTR_ERR(event));
+ kfree(attr);
+ return PTR_ERR(event);
+ } else {
+ events[index] = event;
+ perf_event_enable(events[index]);
+ if (event->hw.idx == -1) {
+ pr_err("pinned event unable to get onto hardware, perf event (cpu:%i-type:%d-config:0x%llx)",
+ cpu, attr->type, attr->config);
+ kfree(attr);
+ return -EINVAL;
+ }
+ pr_info("create perf_event (cpu:%i-idx:%d-type:%d-pinned:%d-exclude_hv:%d"
+ "-exclude_idle:%d-exclude_kernel:%d-config:0x%llx-addr:%px)",
+ event->cpu, event->hw.idx,
+ event->attr.type, event->attr.pinned, event->attr.exclude_hv,
+ event->attr.exclude_idle, event->attr.exclude_kernel,
+ event->attr.config, event);
+ }
+
+ kfree(attr);
+ return 0;
+}
+
+static int release_cpu_counter(int cpu, int event_id, int index)
+{
+ struct perf_event **events = perf_events_of_cpu(cpu);
+ struct perf_event *event = NULL;
+
+ event = events[index];
+
+ if (!event)
+ return 0;
+
+ pr_info("release perf_event (cpu:%i-idx:%d-type:%d-pinned:%d-exclude_hv:%d"
+ "-exclude_idle:%d-exclude_kernel:%d-config:0x%llx)",
+ event->cpu, event->hw.idx,
+ event->attr.type, event->attr.pinned, event->attr.exclude_hv,
+ event->attr.exclude_idle, event->attr.exclude_kernel,
+ event->attr.config);
+
+ perf_event_release_kernel(event);
+ events[index] = NULL;
+
+ return 0;
+}
+
+enum {
+ CYCLES_INDEX = 0,
+ INST_RETIRED_INDEX,
+ PHASE_EVENT_FINAL_TERMINATOR
+};
+
+#define CYCLES 0x0011
+#define INST_RETIRED 0x0008
+
+static int pevents[PHASE_PEVENT_NUM] = {
+ CYCLES,
+ INST_RETIRED,
+ PHASE_EVENT_FINAL_TERMINATOR,
+};
+
+#define for_each_phase_pevents(index, events) \
+ for (index = 0; events != NULL && index < PHASE_PEVENT_NUM && \
+ events[index] != PHASE_EVENT_FINAL_TERMINATOR; index++)
+
+
+/******************************************
+ * Helpers for phase perf
+ *****************************************/
+static int do_pevents(int (*fn)(int, int, int), int cpu)
+{
+ int index;
+ int err;
+
+ for_each_phase_pevents(index, phase_perf_pevents) {
+ err = fn(cpu, phase_perf_pevents[index], index);
+ if (err)
+ return err;
+ }
+
+ return 0;
+}
+
+static int __phase_perf_create(void *args)
+{
+ int err;
+ int cpu = raw_smp_processor_id();
+
+ /* create pinned events */
+ pr_info("create pinned events\n");
+ err = do_pevents(create_cpu_counter, cpu);
+ if (err) {
+ pr_err("create pinned events failed\n");
+ do_pevents(release_cpu_counter, cpu);
+ return err;
+ }
+
+ pr_info("[%d] phase class event create success\n", cpu);
+ return 0;
+}
+
+static int do_phase_perf_create(int *pevents, const struct cpumask *cpus)
+{
+ phase_perf_pevents = pevents;
+ return stop_machine(__phase_perf_create, NULL, cpus);
+}
+
+static int __do_phase_perf_release(void *args)
+{
+ int cpu = raw_smp_processor_id();
+
+ /* release pinned events */
+ pr_info("release pinned events\n");
+ do_pevents(release_cpu_counter, cpu);
+
+ pr_info("[%d] phase class event release success\n", cpu);
+ return 0;
+}
+
+static void do_phase_perf_release(const struct cpumask *cpus)
+{
+ stop_machine(__do_phase_perf_release, NULL, cpus);
+ phase_perf_pevents = NULL;
+}
+
+int phase_perf_create(void)
+{
+ return do_phase_perf_create(pevents, cpu_possible_mask);
+}
+
+void phase_perf_release(void)
+{
+ do_phase_perf_release(cpu_possible_mask);
+}
+
+DECLARE_STATIC_KEY_FALSE(sched_phase);
+DECLARE_STATIC_KEY_FALSE(sched_phase_printk);
+
+#define PHASE_EVENT_OVERFLOW (~0ULL)
+
+static inline u64 phase_event_count_sub(u64 curr, u64 prev)
+{
+ if (curr < prev) { /* ovewrflow */
+ u64 tmp = PHASE_EVENT_OVERFLOW - prev;
+ return curr + tmp;
+ } else {
+ return curr - prev;
+ }
+}
+
+static inline void phase_calc_delta(struct task_struct *p,
+ struct phase_event_count *prev,
+ struct phase_event_count *curr,
+ struct phase_event_count *delta)
+{
+ int *pevents = phase_perf_pevents;
+ int index;
+
+ for_each_phase_pevents(index, pevents) {
+ delta->pcount.data[index] = phase_event_count_sub(curr->pcount.data[index], prev->pcount.data[index]);
+ }
+}
+
+static inline u64 phase_data_of_pevent(struct phase_event_pcount *counter, int event_id)
+{
+ int index;
+ int *events = phase_perf_pevents;
+
+ for_each_phase_pevents(index, events) {
+ if (event_id == events[index])
+ return counter->data[index];
+ }
+
+ return 0;
+}
+
+static int cal_ring_history_average(int *history, int nr, int s_pos, int c_nr)
+{
+ int average = 0;
+ int start = ((s_pos - c_nr) + nr) % nr;
+
+ if (start < 0)
+ return 0;
+
+ for (;start != s_pos;) {
+ if (history[start] == 0) {
+ c_nr--;
+ if (c_nr == 0)
+ return 0;
+ continue;
+ }
+ average += history[start];
+ start = (start + 1) % nr;
+ }
+
+ return start / c_nr;
+}
+
+static void __phase_cal_ipcx10(struct task_struct *p, struct phase_event_count *delta)
+{
+ u64 ins;
+ u64 cycles;
+ //invalid zero
+ int ipcx10 = 0;
+
+ ins = phase_data_of_pevent(&delta->pcount, INST_RETIRED_INDEX);
+ cycles = phase_data_of_pevent(&delta->pcount, CYCLES_INDEX);
+
+ if (cycles)
+ ipcx10 = (ins * 10) / cycles;
+
+ if (static_branch_unlikely(&sched_phase_printk)) {
+ trace_printk("ins:%lld cycles:%lld\n", ins, cycles);
+ }
+
+ p->sched_memqos.ipcx10_history[p->sched_memqos.ipc_ringpos] = ipcx10;
+ p->sched_memqos.ipc_ringpos = (p->sched_memqos.ipc_ringpos + 1) % 10;
+ cal_ring_history_average(p->sched_memqos.ipcx10_history, 10, p->sched_memqos.ipc_ringpos, 5);
+}
+
+static void __phase_cal_memband_div_10(struct task_struct *p)
+{
+ int pos;
+ int result[4];
+
+ pos = p->sched_memqos.memband_ringpos;
+
+ phase_get_memband(p->sched_memqos.mpam_profile.profile, result, 4);
+
+ if (static_branch_unlikely(&sched_phase_printk)) {
+ trace_printk("memband:%d %d %d %d profile:%llx\n", result[0], result[1], result[2], result[3], p->sched_memqos.mpam_profile.profile);
+ }
+
+ p->sched_memqos.memband_div_10_total[0] = p->sched_memqos.memband_div_10_total[0] - p->sched_memqos.memband_div_10_history[0][pos];
+ p->sched_memqos.memband_div_10_total[0] = p->sched_memqos.memband_div_10_total[0] + result[0] / 10;
+ p->sched_memqos.memband_div_10_history[0][p->sched_memqos.memband_ringpos] = result[0] / 10;
+
+ p->sched_memqos.memband_div_10_total[1] = p->sched_memqos.memband_div_10_total[1] - p->sched_memqos.memband_div_10_history[1][pos];
+ p->sched_memqos.memband_div_10_total[1] = p->sched_memqos.memband_div_10_total[1] + result[1] / 10;
+ p->sched_memqos.memband_div_10_history[1][p->sched_memqos.memband_ringpos] = result[1] / 10;
+
+ p->sched_memqos.memband_div_10_total[2] = p->sched_memqos.memband_div_10_total[2] - p->sched_memqos.memband_div_10_history[2][pos];
+ p->sched_memqos.memband_div_10_total[2] = p->sched_memqos.memband_div_10_total[2] + result[2] / 10;
+ p->sched_memqos.memband_div_10_history[2][p->sched_memqos.memband_ringpos] = result[2] / 10;
+
+ p->sched_memqos.memband_div_10_total[3] = p->sched_memqos.memband_div_10_total[3] - p->sched_memqos.memband_div_10_history[3][pos];
+ p->sched_memqos.memband_div_10_total[3] = p->sched_memqos.memband_div_10_total[3] + result[3] / 10;
+ p->sched_memqos.memband_div_10_history[3][p->sched_memqos.memband_ringpos] = result[3] / 10;
+
+ p->sched_memqos.memband_ringpos = (pos + 1) % 10;
+
+ cal_ring_history_average(p->sched_memqos.memband_div_10_history[0], 10, pos, 5);
+ cal_ring_history_average(p->sched_memqos.memband_div_10_history[1], 10, pos, 5);
+ cal_ring_history_average(p->sched_memqos.memband_div_10_history[2], 10, pos, 5);
+ cal_ring_history_average(p->sched_memqos.memband_div_10_history[3], 10, pos, 5);
+}
+
+static DEFINE_PER_CPU(struct phase_event_count, prev_phase_event_count);
+static DEFINE_PER_CPU(struct phase_event_count, curr_phase_event_count);
+
+static void phase_perf_read_events(int cpu, u64 *pdata)
+{
+ int index;
+
+ for_each_phase_pevents(index, phase_perf_pevents) {
+ pdata[index] = read_cpu_counter(cpu, index);
+ }
+}
+
+static inline struct phase_event_count *phase_read_prev(unsigned int cpu)
+{
+ return &per_cpu(prev_phase_event_count, cpu);
+}
+
+static inline struct phase_event_count *phase_read_curr(unsigned int cpu)
+{
+ struct phase_event_count *curr = &per_cpu(curr_phase_event_count, cpu);
+
+ phase_perf_read_events(cpu, curr->pcount.data);
+
+ return curr;
+}
+
+void phase_account_task(struct task_struct *p, int cpu)
+{
+ struct phase_event_count delta;
+ struct phase_event_count *prev, *curr;
+
+ if (!static_branch_likely(&sched_phase))
+ return;
+
+ //if (!sched_core_enabled(cpu_rq(cpu)))
+ // return;
+
+ /* update phase_event_count */
+ prev = phase_read_prev(cpu);
+ curr = phase_read_curr(cpu);
+ phase_calc_delta(p, prev, curr, &delta);
+ *prev = *curr;
+
+ /* calculate phase */
+ __phase_cal_ipcx10(p, &delta);
+ __phase_cal_memband_div_10(p);
+ p->sched_memqos.sample_times++;
+ if ((p->sched_memqos.sample_times % 3) == 0)
+ p->sched_memqos.account_ready = 1;
+}
+
+
+void phase_trace_printk(struct task_struct *p)
+{
+ if (!static_branch_unlikely(&sched_phase_printk))
+ return;
+
+ trace_printk("p->comm:%s(%d) ipcpos:%d ipcx10:%d membandpos:%d memband_div_10:%d numa_score[0]:%d numa_score[1]:%d numa_score[2]:%d numa_score[3]:%d turbo:%d prefered_nid:%d classid:%d partid:%d\n",
+ p->comm, p->pid, p->sched_memqos.ipc_ringpos,\
+ p->sched_memqos.ipcx10, \
+ p->sched_memqos.memband_ringpos,\
+ p->sched_memqos.memband_div_10, \
+ p->sched_memqos.numa_score[0], \
+ p->sched_memqos.numa_score[1], \
+ p->sched_memqos.numa_score[2], \
+ p->sched_memqos.numa_score[3], \
+ p->sched_memqos.turbo, \
+ p->sched_memqos.preferred_nid, \
+ p->sched_memqos.class_id, \
+ p->closid);
+}
diff --git a/kernel/sched/memqos/phase_sim_knn.c b/kernel/sched/memqos/phase_sim_knn.c
new file mode 100644
index 0000000000000..b80bb6b9ae0a3
--- /dev/null
+++ b/kernel/sched/memqos/phase_sim_knn.c
@@ -0,0 +1,92 @@
+#include <linux/types.h>
+
+#define DATA_ROW 20
+void QuickSort(u64 arr[DATA_ROW][2], int L, int R) {
+ int i = L;
+ int j = R;
+ int kk = (L + R) / 2;
+ u64 pivot = arr[kk][0];
+
+ while (i <= j) {
+ while (pivot > arr[i][0]) {
+ i++;
+ }
+ while (pivot < arr[j][0]) {
+ j--;
+ }
+ if (i <= j) {
+ u64 temp = arr[i][0];
+
+ arr[i][0] = arr[j][0];
+ arr[j][0] = temp;
+ i++; j--;
+ }
+ }
+ if (L < j) {
+ QuickSort(arr, L, j);
+ }
+ if (i < R) {
+ QuickSort(arr, i, R);
+ }
+}
+
+u64 euclidean_distance(u64 *row1, u64 *row2, int col) {
+ u64 distance = 0;
+ int i;
+
+ for (i = 0; i < col - 1; i++) {
+ distance += ((row1[i] - row2[i]) * (row1[i] - row2[i]));
+ }
+ return distance;
+}
+
+#define num_neighbors 6
+#define MAX_TAG 8
+
+int get_neighbors_tag(u64 train_data[DATA_ROW][3], int train_row, int col, u64 *test_row) {
+ int i;
+ u64 neighbors[MAX_TAG] = {0};
+ int max_tag = 0;
+ u64 distances[DATA_ROW][2];
+
+ for (i = 0; i < train_row; i++) {
+ distances[i][0] = euclidean_distance(train_data[i], test_row, col);
+ distances[i][1] = train_data[i][col - 1];
+ }
+ QuickSort(distances, 0, train_row - 1);
+ for (i = 0; i < num_neighbors; i++) {
+ neighbors[distances[i][1]]++;
+ if (neighbors[distances[i][1]] > neighbors[max_tag])
+ max_tag = distances[i][1];
+ }
+ return max_tag;
+}
+
+static u64 train_data[DATA_ROW][3] = {
+ {0, 1, 0},
+ {0, 9, 0},
+ {0, 20, 1},
+ {0, 30, 1},
+ {0, 40, 2},
+ {0, 50, 3},
+ {0, 60, 3},
+ {0, 70, 3},
+ {0, 80, 4},
+ {0, 90, 4},
+ {0, 100, 4},
+ {0, 110, 5},
+ {0, 120, 5},
+ {0, 130, 6},
+ {0, 140, 6},
+ {0, 150, 7},
+};
+
+int knn_get_tag(int ipcx10, int memband_div_10)
+{
+ u64 test_data[2];
+
+ test_data[0] = ipcx10;
+ test_data[1] = memband_div_10;
+
+ return get_neighbors_tag(train_data, DATA_ROW, 3, test_data);
+}
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 685f9881b8e23..0d2764c4449ce 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -465,6 +465,13 @@ static struct ctl_table kern_table[] = {
.extra2 = &one,
},
#endif /* CONFIG_NUMA_BALANCING */
+#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
+ {
+ .procname = "phase",
+ .mode = 0555,
+ .child = phase_table,
+ },
+#endif
#endif /* CONFIG_SCHED_DEBUG */
{
.procname = "sched_rt_period_us",
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 4cac46d56f387..d748c291e7047 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -2164,12 +2164,15 @@ alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
{
struct mempolicy *pol;
struct page *page;
- int preferred_nid;
+ int preferred_nid = -1;
nodemask_t *nmask;
+ if (current->sched_memqos.preferred_nid)
+ preferred_nid = current->sched_memqos.preferred_nid - 1;
+
pol = get_vma_policy(vma, addr);
- if (pol->mode == MPOL_INTERLEAVE) {
+ if (pol->mode == MPOL_INTERLEAVE && preferred_nid != -1) {
unsigned nid;
nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order);
@@ -2233,7 +2236,8 @@ alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
}
nmask = policy_nodemask(gfp, pol);
- preferred_nid = policy_node(gfp, pol, node);
+ if (preferred_nid == -1)
+ preferred_nid = policy_node(gfp, pol, node);
page = __alloc_pages_nodemask(gfp, order, preferred_nid, nmask);
mark_vma_cdm(nmask, page, vma);
mpol_cond_put(pol);
--
2.25.1
1
0

22 Mar '23
Send 16 patches to test patchwork->PR function
Baisong Zhong (1):
media: dvb-usb: az6027: fix null-ptr-deref in az6027_i2c_xfer()
Chen Zhongjin (1):
ftrace: Fix invalid address access in lookup_rec() when index is 0
Darrick J. Wong (1):
ext4: fix another off-by-one fsmap error on 1k block filesystems
David Hildenbrand (2):
mm: optimize do_wp_page() for exclusive pages in the swapcache
mm: optimize do_wp_page() for fresh pages in local LRU pagevecs
Kuniyuki Iwashima (1):
seccomp: Move copy_seccomp() to no failure path.
Li Huafei (2):
livepatch: Cleanup klp_mem_prepare()
livepatch: Narrow the scope of the 'text_mutex' lock
Luke D. Jones (1):
HID: asus: Remove check for same LED brightness on set
Nicholas Piggin (1):
mm/vmalloc: huge vmalloc backing pages should be split rather than
compound
Pietro Borrello (2):
HID: asus: use spinlock to protect concurrent accesses
HID: asus: use spinlock to safely schedule workers
Xin Long (2):
tipc: set con sock in tipc_conn_alloc
tipc: add an extra conn_get in tipc_conn_alloc
Zheng Yejian (1):
livepatch/core: Fix hungtask against cpu hotplug on x86
Zhihao Cheng (1):
jbd2: fix data missing when reusing bh which is ready to be
checkpointed
arch/x86/kernel/livepatch.c | 11 +++++--
drivers/hid/hid-asus.c | 38 ++++++++++++++++++-----
drivers/media/usb/dvb-usb/az6027.c | 4 +++
fs/ext4/fsmap.c | 2 ++
fs/jbd2/transaction.c | 50 +++++++++++++++++-------------
kernel/fork.c | 17 ++++++----
kernel/livepatch/core.c | 49 ++++++++++++++++++++---------
kernel/trace/ftrace.c | 3 +-
mm/memory.c | 28 +++++++++++++----
mm/vmalloc.c | 22 ++++++++++---
net/tipc/topsrv.c | 20 ++++++------
11 files changed, 172 insertions(+), 72 deletions(-)
--
2.25.1
1
16

[PATCH openEuler-5.10-LTS-SP1 01/16] jbd2: fix data missing when reusing bh which is ready to be checkpointed
by Jialin Zhang 22 Mar '23
by Jialin Zhang 22 Mar '23
22 Mar '23
From: Zhihao Cheng <chengzhihao1(a)huawei.com>
mainline inclusion
from mainline-v6.3-rc1
commit e6b9bd7290d334451ce054e98e752abc055e0034
category: bugfix
bugzilla: https://gitee.com/src-openeuler/kernel/issues/I6C5HV
CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?…
--------------------------------
Following process will make data lost and could lead to a filesystem
corrupted problem:
1. jh(bh) is inserted into T1->t_checkpoint_list, bh is dirty, and
jh->b_transaction = NULL
2. T1 is added into journal->j_checkpoint_transactions.
3. Get bh prepare to write while doing checkpoing:
PA PB
do_get_write_access jbd2_log_do_checkpoint
spin_lock(&jh->b_state_lock)
if (buffer_dirty(bh))
clear_buffer_dirty(bh) // clear buffer dirty
set_buffer_jbddirty(bh)
transaction =
journal->j_checkpoint_transactions
jh = transaction->t_checkpoint_list
if (!buffer_dirty(bh))
__jbd2_journal_remove_checkpoint(jh)
// bh won't be flushed
jbd2_cleanup_journal_tail
__jbd2_journal_file_buffer(jh, transaction, BJ_Reserved)
4. Aborting journal/Power-cut before writing latest bh on journal area.
In this way we get a corrupted filesystem with bh's data lost.
Fix it by moving the clearing of buffer_dirty bit just before the call
to __jbd2_journal_file_buffer(), both bit clearing and jh->b_transaction
assignment are under journal->j_list_lock locked, so that
jbd2_log_do_checkpoint() will wait until jh's new transaction fininshed
even bh is currently not dirty. And journal_shrink_one_cp_list() won't
remove jh from checkpoint list if the buffer head is reused in
do_get_write_access().
Fetch a reproducer in [Link].
Link: https://bugzilla.kernel.org/show_bug.cgi?id=216898
Cc: <stable(a)kernel.org>
Signed-off-by: Zhihao Cheng <chengzhihao1(a)huawei.com>
Signed-off-by: zhanchengbin <zhanchengbin1(a)huawei.com>
Suggested-by: Jan Kara <jack(a)suse.cz>
Reviewed-by: Jan Kara <jack(a)suse.cz>
Link: https://lore.kernel.org/r/20230110015327.1181863-1-chengzhihao1@huawei.com
Signed-off-by: Theodore Ts'o <tytso(a)mit.edu>
Reviewed-by: Yang Erkun <yangerkun(a)huawei.com>
Reviewed-by: Zhang Yi <yi.zhang(a)huawei.com>
Signed-off-by: Jialin Zhang <zhangjialin11(a)huawei.com>
---
fs/jbd2/transaction.c | 50 +++++++++++++++++++++++++------------------
1 file changed, 29 insertions(+), 21 deletions(-)
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index cefee2dead54..8fa88c42fcb4 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -984,36 +984,28 @@ do_get_write_access(handle_t *handle, struct journal_head *jh,
* ie. locked but not dirty) or tune2fs (which may actually have
* the buffer dirtied, ugh.) */
- if (buffer_dirty(bh)) {
+ if (buffer_dirty(bh) && jh->b_transaction) {
+ warn_dirty_buffer(bh);
/*
- * First question: is this buffer already part of the current
- * transaction or the existing committing transaction?
- */
- if (jh->b_transaction) {
- J_ASSERT_JH(jh,
- jh->b_transaction == transaction ||
- jh->b_transaction ==
- journal->j_committing_transaction);
- if (jh->b_next_transaction)
- J_ASSERT_JH(jh, jh->b_next_transaction ==
- transaction);
- warn_dirty_buffer(bh);
- }
- /*
- * In any case we need to clean the dirty flag and we must
- * do it under the buffer lock to be sure we don't race
- * with running write-out.
+ * We need to clean the dirty flag and we must do it under the
+ * buffer lock to be sure we don't race with running write-out.
*/
JBUFFER_TRACE(jh, "Journalling dirty buffer");
clear_buffer_dirty(bh);
+ /*
+ * The buffer is going to be added to BJ_Reserved list now and
+ * nothing guarantees jbd2_journal_dirty_metadata() will be
+ * ever called for it. So we need to set jbddirty bit here to
+ * make sure the buffer is dirtied and written out when the
+ * journaling machinery is done with it.
+ */
set_buffer_jbddirty(bh);
}
- unlock_buffer(bh);
-
error = -EROFS;
if (is_handle_aborted(handle)) {
spin_unlock(&jh->b_state_lock);
+ unlock_buffer(bh);
goto out;
}
error = 0;
@@ -1023,8 +1015,10 @@ do_get_write_access(handle_t *handle, struct journal_head *jh,
* b_next_transaction points to it
*/
if (jh->b_transaction == transaction ||
- jh->b_next_transaction == transaction)
+ jh->b_next_transaction == transaction) {
+ unlock_buffer(bh);
goto done;
+ }
/*
* this is the first time this transaction is touching this buffer,
@@ -1048,10 +1042,24 @@ do_get_write_access(handle_t *handle, struct journal_head *jh,
*/
smp_wmb();
spin_lock(&journal->j_list_lock);
+ if (test_clear_buffer_dirty(bh)) {
+ /*
+ * Execute buffer dirty clearing and jh->b_transaction
+ * assignment under journal->j_list_lock locked to
+ * prevent bh being removed from checkpoint list if
+ * the buffer is in an intermediate state (not dirty
+ * and jh->b_transaction is NULL).
+ */
+ JBUFFER_TRACE(jh, "Journalling dirty buffer");
+ set_buffer_jbddirty(bh);
+ }
__jbd2_journal_file_buffer(jh, transaction, BJ_Reserved);
spin_unlock(&journal->j_list_lock);
+ unlock_buffer(bh);
goto done;
}
+ unlock_buffer(bh);
+
/*
* If there is already a copy-out version of this buffer, then we don't
* need to make another one
--
2.25.1
1
15

[PATCH openEuler-5.10-LTS-SP1 01/14] jbd2: fix data missing when reusing bh which is ready to be checkpointed
by Jialin Zhang 22 Mar '23
by Jialin Zhang 22 Mar '23
22 Mar '23
From: Zhihao Cheng <chengzhihao1(a)huawei.com>
mainline inclusion
from mainline-v6.3-rc1
commit e6b9bd7290d334451ce054e98e752abc055e0034
category: bugfix
bugzilla: https://gitee.com/src-openeuler/kernel/issues/I6C5HV
CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?…
--------------------------------
Following process will make data lost and could lead to a filesystem
corrupted problem:
1. jh(bh) is inserted into T1->t_checkpoint_list, bh is dirty, and
jh->b_transaction = NULL
2. T1 is added into journal->j_checkpoint_transactions.
3. Get bh prepare to write while doing checkpoing:
PA PB
do_get_write_access jbd2_log_do_checkpoint
spin_lock(&jh->b_state_lock)
if (buffer_dirty(bh))
clear_buffer_dirty(bh) // clear buffer dirty
set_buffer_jbddirty(bh)
transaction =
journal->j_checkpoint_transactions
jh = transaction->t_checkpoint_list
if (!buffer_dirty(bh))
__jbd2_journal_remove_checkpoint(jh)
// bh won't be flushed
jbd2_cleanup_journal_tail
__jbd2_journal_file_buffer(jh, transaction, BJ_Reserved)
4. Aborting journal/Power-cut before writing latest bh on journal area.
In this way we get a corrupted filesystem with bh's data lost.
Fix it by moving the clearing of buffer_dirty bit just before the call
to __jbd2_journal_file_buffer(), both bit clearing and jh->b_transaction
assignment are under journal->j_list_lock locked, so that
jbd2_log_do_checkpoint() will wait until jh's new transaction fininshed
even bh is currently not dirty. And journal_shrink_one_cp_list() won't
remove jh from checkpoint list if the buffer head is reused in
do_get_write_access().
Fetch a reproducer in [Link].
Link: https://bugzilla.kernel.org/show_bug.cgi?id=216898
Cc: <stable(a)kernel.org>
Signed-off-by: Zhihao Cheng <chengzhihao1(a)huawei.com>
Signed-off-by: zhanchengbin <zhanchengbin1(a)huawei.com>
Suggested-by: Jan Kara <jack(a)suse.cz>
Reviewed-by: Jan Kara <jack(a)suse.cz>
Link: https://lore.kernel.org/r/20230110015327.1181863-1-chengzhihao1@huawei.com
Signed-off-by: Theodore Ts'o <tytso(a)mit.edu>
Reviewed-by: Yang Erkun <yangerkun(a)huawei.com>
Reviewed-by: Zhang Yi <yi.zhang(a)huawei.com>
Signed-off-by: Jialin Zhang <zhangjialin11(a)huawei.com>
---
fs/jbd2/transaction.c | 50 +++++++++++++++++++++++++------------------
1 file changed, 29 insertions(+), 21 deletions(-)
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index cefee2dead54..8fa88c42fcb4 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -984,36 +984,28 @@ do_get_write_access(handle_t *handle, struct journal_head *jh,
* ie. locked but not dirty) or tune2fs (which may actually have
* the buffer dirtied, ugh.) */
- if (buffer_dirty(bh)) {
+ if (buffer_dirty(bh) && jh->b_transaction) {
+ warn_dirty_buffer(bh);
/*
- * First question: is this buffer already part of the current
- * transaction or the existing committing transaction?
- */
- if (jh->b_transaction) {
- J_ASSERT_JH(jh,
- jh->b_transaction == transaction ||
- jh->b_transaction ==
- journal->j_committing_transaction);
- if (jh->b_next_transaction)
- J_ASSERT_JH(jh, jh->b_next_transaction ==
- transaction);
- warn_dirty_buffer(bh);
- }
- /*
- * In any case we need to clean the dirty flag and we must
- * do it under the buffer lock to be sure we don't race
- * with running write-out.
+ * We need to clean the dirty flag and we must do it under the
+ * buffer lock to be sure we don't race with running write-out.
*/
JBUFFER_TRACE(jh, "Journalling dirty buffer");
clear_buffer_dirty(bh);
+ /*
+ * The buffer is going to be added to BJ_Reserved list now and
+ * nothing guarantees jbd2_journal_dirty_metadata() will be
+ * ever called for it. So we need to set jbddirty bit here to
+ * make sure the buffer is dirtied and written out when the
+ * journaling machinery is done with it.
+ */
set_buffer_jbddirty(bh);
}
- unlock_buffer(bh);
-
error = -EROFS;
if (is_handle_aborted(handle)) {
spin_unlock(&jh->b_state_lock);
+ unlock_buffer(bh);
goto out;
}
error = 0;
@@ -1023,8 +1015,10 @@ do_get_write_access(handle_t *handle, struct journal_head *jh,
* b_next_transaction points to it
*/
if (jh->b_transaction == transaction ||
- jh->b_next_transaction == transaction)
+ jh->b_next_transaction == transaction) {
+ unlock_buffer(bh);
goto done;
+ }
/*
* this is the first time this transaction is touching this buffer,
@@ -1048,10 +1042,24 @@ do_get_write_access(handle_t *handle, struct journal_head *jh,
*/
smp_wmb();
spin_lock(&journal->j_list_lock);
+ if (test_clear_buffer_dirty(bh)) {
+ /*
+ * Execute buffer dirty clearing and jh->b_transaction
+ * assignment under journal->j_list_lock locked to
+ * prevent bh being removed from checkpoint list if
+ * the buffer is in an intermediate state (not dirty
+ * and jh->b_transaction is NULL).
+ */
+ JBUFFER_TRACE(jh, "Journalling dirty buffer");
+ set_buffer_jbddirty(bh);
+ }
__jbd2_journal_file_buffer(jh, transaction, BJ_Reserved);
spin_unlock(&journal->j_list_lock);
+ unlock_buffer(bh);
goto done;
}
+ unlock_buffer(bh);
+
/*
* If there is already a copy-out version of this buffer, then we don't
* need to make another one
--
2.25.1
1
13

[PATCH openEuler-1.0-LTS] ext4: fix another off-by-one fsmap error on 1k block filesystems
by Yongqiang Liu 22 Mar '23
by Yongqiang Liu 22 Mar '23
22 Mar '23
From: "Darrick J. Wong" <djwong(a)kernel.org>
mainline inclusion
from mainline-v6.3-rc2
commit c993799baf9c5861f8df91beb80e1611b12efcbd
category: bugfix
bugzilla: 188522,https://gitee.com/openeuler/kernel/issues/I6N7ZP
CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?…
--------------------------------
Apparently syzbot figured out that issuing this FSMAP call:
struct fsmap_head cmd = {
.fmh_count = ...;
.fmh_keys = {
{ .fmr_device = /* ext4 dev */, .fmr_physical = 0, },
{ .fmr_device = /* ext4 dev */, .fmr_physical = 0, },
},
...
};
ret = ioctl(fd, FS_IOC_GETFSMAP, &cmd);
Produces this crash if the underlying filesystem is a 1k-block ext4
filesystem:
kernel BUG at fs/ext4/ext4.h:3331!
invalid opcode: 0000 [#1] PREEMPT SMP
CPU: 3 PID: 3227965 Comm: xfs_io Tainted: G W O 6.2.0-rc8-achx
Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.15.0-1 04/01/2014
RIP: 0010:ext4_mb_load_buddy_gfp+0x47c/0x570 [ext4]
RSP: 0018:ffffc90007c03998 EFLAGS: 00010246
RAX: ffff888004978000 RBX: ffffc90007c03a20 RCX: ffff888041618000
RDX: 0000000000000000 RSI: 00000000000005a4 RDI: ffffffffa0c99b11
RBP: ffff888012330000 R08: ffffffffa0c2b7d0 R09: 0000000000000400
R10: ffffc90007c03950 R11: 0000000000000000 R12: 0000000000000001
R13: 00000000ffffffff R14: 0000000000000c40 R15: ffff88802678c398
FS: 00007fdf2020c880(0000) GS:ffff88807e100000(0000) knlGS:0000000000000000
CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 00007ffd318a5fe8 CR3: 000000007f80f001 CR4: 00000000001706e0
Call Trace:
<TASK>
ext4_mballoc_query_range+0x4b/0x210 [ext4 dfa189daddffe8fecd3cdfd00564e0f265a8ab80]
ext4_getfsmap_datadev+0x713/0x890 [ext4 dfa189daddffe8fecd3cdfd00564e0f265a8ab80]
ext4_getfsmap+0x2b7/0x330 [ext4 dfa189daddffe8fecd3cdfd00564e0f265a8ab80]
ext4_ioc_getfsmap+0x153/0x2b0 [ext4 dfa189daddffe8fecd3cdfd00564e0f265a8ab80]
__ext4_ioctl+0x2a7/0x17e0 [ext4 dfa189daddffe8fecd3cdfd00564e0f265a8ab80]
__x64_sys_ioctl+0x82/0xa0
do_syscall_64+0x2b/0x80
entry_SYSCALL_64_after_hwframe+0x46/0xb0
RIP: 0033:0x7fdf20558aff
RSP: 002b:00007ffd318a9e30 EFLAGS: 00000246 ORIG_RAX: 0000000000000010
RAX: ffffffffffffffda RBX: 00000000000200c0 RCX: 00007fdf20558aff
RDX: 00007fdf1feb2010 RSI: 00000000c0c0583b RDI: 0000000000000003
RBP: 00005625c0634be0 R08: 00005625c0634c40 R09: 0000000000000001
R10: 0000000000000000 R11: 0000000000000246 R12: 00007fdf1feb2010
R13: 00005625be70d994 R14: 0000000000000800 R15: 0000000000000000
For GETFSMAP calls, the caller selects a physical block device by
writing its block number into fsmap_head.fmh_keys[01].fmr_device.
To query mappings for a subrange of the device, the starting byte of the
range is written to fsmap_head.fmh_keys[0].fmr_physical and the last
byte of the range goes in fsmap_head.fmh_keys[1].fmr_physical.
IOWs, to query what mappings overlap with bytes 3-14 of /dev/sda, you'd
set the inputs as follows:
fmh_keys[0] = { .fmr_device = major(8, 0), .fmr_physical = 3},
fmh_keys[1] = { .fmr_device = major(8, 0), .fmr_physical = 14},
Which would return you whatever is mapped in the 12 bytes starting at
physical offset 3.
The crash is due to insufficient range validation of keys[1] in
ext4_getfsmap_datadev. On 1k-block filesystems, block 0 is not part of
the filesystem, which means that s_first_data_block is nonzero.
ext4_get_group_no_and_offset subtracts this quantity from the blocknr
argument before cracking it into a group number and a block number
within a group. IOWs, block group 0 spans blocks 1-8192 (1-based)
instead of 0-8191 (0-based) like what happens with larger blocksizes.
The net result of this encoding is that blocknr < s_first_data_block is
not a valid input to this function. The end_fsb variable is set from
the keys that are copied from userspace, which means that in the above
example, its value is zero. That leads to an underflow here:
blocknr = blocknr - le32_to_cpu(es->s_first_data_block);
The division then operates on -1:
offset = do_div(blocknr, EXT4_BLOCKS_PER_GROUP(sb)) >>
EXT4_SB(sb)->s_cluster_bits;
Leaving an impossibly large group number (2^32-1) in blocknr.
ext4_getfsmap_check_keys checked that keys[0].fmr_physical and
keys[1].fmr_physical are in increasing order, but
ext4_getfsmap_datadev adjusts keys[0].fmr_physical to be at least
s_first_data_block. This implies that we have to check it again after
the adjustment, which is the piece that I forgot.
Reported-by: syzbot+6be2b977c89f79b6b153(a)syzkaller.appspotmail.com
Fixes: 4a4956249dac ("ext4: fix off-by-one fsmap error on 1k block filesystems")
Link: https://syzkaller.appspot.com/bug?id=79d5768e9bfe362911ac1a5057a36fc6b5c300…
Cc: stable(a)vger.kernel.org
Signed-off-by: Darrick J. Wong <djwong(a)kernel.org>
Link: https://lore.kernel.org/r/Y+58NPTH7VNGgzdd@magnolia
Signed-off-by: Theodore Ts'o <tytso(a)mit.edu>
Signed-off-by: Baokun Li <libaokun1(a)huawei.com>
Reviewed-by: Zhihao Cheng <chengzhihao1(a)huawei.com>
Reviewed-by: Zhang Yi <yi.zhang(a)huawei.com>
Signed-off-by: Yongqiang Liu <liuyongqiang13(a)huawei.com>
---
fs/ext4/fsmap.c | 2 ++
1 file changed, 2 insertions(+)
diff --git a/fs/ext4/fsmap.c b/fs/ext4/fsmap.c
index 6f3f245f3a80..6b52ace1463c 100644
--- a/fs/ext4/fsmap.c
+++ b/fs/ext4/fsmap.c
@@ -486,6 +486,8 @@ static int ext4_getfsmap_datadev(struct super_block *sb,
keys[0].fmr_physical = bofs;
if (keys[1].fmr_physical >= eofs)
keys[1].fmr_physical = eofs - 1;
+ if (keys[1].fmr_physical < keys[0].fmr_physical)
+ return 0;
start_fsb = keys[0].fmr_physical;
end_fsb = keys[1].fmr_physical;
--
2.25.1
1
0
您好!
Kernel SIG 邀请您参加 2023-03-24 14:00 召开的Zoom会议(自动录制)
会议主题:openEuler Kernel SIG双周例会
会议内容:
1. 进展update
2. 议题征集中
欢迎大家积极申报议题(新增议题可以直接回复邮件,或录入会议看板)
会议链接:https://us06web.zoom.us/j/85131900036?pwd=RFZVYlZFK3B1RVhpTHROOW82OTdLQT09
会议纪要:https://etherpad.openeuler.org/p/Kernel-meetings
温馨提醒:建议接入会议后修改参会人的姓名,也可以使用您在gitee.com的ID
更多资讯尽在:https://openeuler.org/zh/
Hello!
openEuler Kernel SIG invites you to attend the Zoom conference(auto recording) will be held at 2023-03-24 14:00,
The subject of the conference is openEuler Kernel SIG双周例会,
Summary:
1. 进展update
2. 议题征集中
欢迎大家积极申报议题(新增议题可以直接回复邮件,或录入会议看板)
You can join the meeting at https://us06web.zoom.us/j/85131900036?pwd=RFZVYlZFK3B1RVhpTHROOW82OTdLQT09.
Add topics at https://etherpad.openeuler.org/p/Kernel-meetings.
Note: You are advised to change the participant name after joining the conference or use your ID at gitee.com.
More information: https://openeuler.org/en/
1
0

21 Mar '23
From: Xin Long <lucien.xin(a)gmail.com>
stable inclusion
from stable-v4.19.268
commit 2c9c64a95d97727c9ada0d35abc90ee5fdbaeff7
category: bugfix
bugzilla: https://gitee.com/src-openeuler/kernel/issues/I6NCRH
CVE: CVE-2023-1382
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id…
--------------------------------
[ Upstream commit 0e5d56c64afcd6fd2d132ea972605b66f8a7d3c4 ]
A crash was reported by Wei Chen:
BUG: kernel NULL pointer dereference, address: 0000000000000018
RIP: 0010:tipc_conn_close+0x12/0x100
Call Trace:
tipc_topsrv_exit_net+0x139/0x320
ops_exit_list.isra.9+0x49/0x80
cleanup_net+0x31a/0x540
process_one_work+0x3fa/0x9f0
worker_thread+0x42/0x5c0
It was caused by !con->sock in tipc_conn_close(). In tipc_topsrv_accept(),
con is allocated in conn_idr then its sock is set:
con = tipc_conn_alloc();
... <----[1]
con->sock = newsock;
If tipc_conn_close() is called in anytime of [1], the null-pointer-def
is triggered by con->sock->sk due to con->sock is not yet set.
This patch fixes it by moving the con->sock setting to tipc_conn_alloc()
under s->idr_lock. So that con->sock can never be NULL when getting the
con from s->conn_idr. It will be also safer to move con->server and flag
CF_CONNECTED setting under s->idr_lock, as they should all be set before
tipc_conn_alloc() is called.
Fixes: c5fa7b3cf3cb ("tipc: introduce new TIPC server infrastructure")
Reported-by: Wei Chen <harperchen1110(a)gmail.com>
Signed-off-by: Xin Long <lucien.xin(a)gmail.com>
Acked-by: Jon Maloy <jmaloy(a)redhat.com>
Signed-off-by: Jakub Kicinski <kuba(a)kernel.org>
Signed-off-by: Sasha Levin <sashal(a)kernel.org>
conflict:
net/tipc/topsrv.c
Signed-off-by: Lu Wei <luwei32(a)huawei.com>
Reviewed-by: Liu Jian <liujian56(a)huawei.com>
Reviewed-by: Xiu Jianfeng <xiujianfeng(a)huawei.com>
Signed-off-by: Yongqiang Liu <liuyongqiang13(a)huawei.com>
---
net/tipc/topsrv.c | 11 +++++------
1 file changed, 5 insertions(+), 6 deletions(-)
diff --git a/net/tipc/topsrv.c b/net/tipc/topsrv.c
index 1c4733153d74..89a1f127dfaf 100644
--- a/net/tipc/topsrv.c
+++ b/net/tipc/topsrv.c
@@ -184,7 +184,7 @@ static void tipc_conn_close(struct tipc_conn *con)
conn_put(con);
}
-static struct tipc_conn *tipc_conn_alloc(struct tipc_topsrv *s)
+static struct tipc_conn *tipc_conn_alloc(struct tipc_topsrv *s, struct socket *sock)
{
struct tipc_conn *con;
int ret;
@@ -210,10 +210,11 @@ static struct tipc_conn *tipc_conn_alloc(struct tipc_topsrv *s)
}
con->conid = ret;
s->idr_in_use++;
- spin_unlock_bh(&s->idr_lock);
set_bit(CF_CONNECTED, &con->flags);
con->server = s;
+ con->sock = sock;
+ spin_unlock_bh(&s->idr_lock);
return con;
}
@@ -467,7 +468,7 @@ static void tipc_topsrv_accept(struct work_struct *work)
ret = kernel_accept(lsock, &newsock, O_NONBLOCK);
if (ret < 0)
return;
- con = tipc_conn_alloc(srv);
+ con = tipc_conn_alloc(srv, newsock);
if (IS_ERR(con)) {
ret = PTR_ERR(con);
sock_release(newsock);
@@ -479,7 +480,6 @@ static void tipc_topsrv_accept(struct work_struct *work)
newsk->sk_data_ready = tipc_conn_data_ready;
newsk->sk_write_space = tipc_conn_write_space;
newsk->sk_user_data = con;
- con->sock = newsock;
write_unlock_bh(&newsk->sk_callback_lock);
/* Wake up receive process in case of 'SYN+' message */
@@ -577,12 +577,11 @@ bool tipc_topsrv_kern_subscr(struct net *net, u32 port, u32 type, u32 lower,
sub.filter = filter;
*(u32 *)&sub.usr_handle = port;
- con = tipc_conn_alloc(tipc_topsrv(net));
+ con = tipc_conn_alloc(tipc_topsrv(net), NULL);
if (IS_ERR(con))
return false;
*conid = con->conid;
- con->sock = NULL;
rc = tipc_conn_rcv_sub(tipc_topsrv(net), con, &sub);
if (rc >= 0)
return true;
--
2.25.1
1
1