Kernel

07 Feb '23
From: Dave Chinner <dchinner(a)redhat.com>
mainline inclusion
from mainline-v5.19-rc1
commit c230a4a85bcdbfc1a7415deec6caf04e8fca1301
category: bugfix
bugzilla: 187372, https://gitee.com/openeuler/kernel/issues/I5K0OM
CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?…
--------------------------------
Ever since we added shadow format buffers to the log items, log
items need to handle the item being released with shadow buffers
attached. Because this requirement was added at the same time we
added the new rmap/reflink intents, we missed the cleanup of those
items.
In theory, this means shadow buffers can be leaked in a very small
window when a shutdown is initiated. Testing with KASAN shows this
leak does not happen in practice - we haven't identified a single
leak in several years of shutdown testing since ~v4.8 kernels.
However, the intent whiteout cleanup mechanism leaves every
cancelled intent in exactly the same state as this tiny race window
creates, so if intents don't clean up shadow buffers on final
release we will leak the shadow buffer for just about every intent
we create.
Hence we start with this patch to close this condition off and
ensure that when whiteouts start to be used we don't leak lots of
memory.
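The fix is the same one-line pattern in every intent and intent-done
release path touched below: free the shadow log vector buffer attached
to the embedded log item before returning the item to its slab cache.
A minimal illustrative sketch follows; the "foo" item type is
hypothetical and stands in for the real BUI/BUD, CUI/CUD, RUI/RUD and
icreate items changed in the hunks:

/* Illustrative only: mirrors the pattern applied in the hunks below. */
STATIC void
xfs_foo_item_free(
	struct xfs_foo_log_item	*fip)
{
	/*
	 * Free the shadow buffer attached to the embedded log item first;
	 * kmem_free() tolerates a NULL pointer, so this is safe even when
	 * no shadow buffer was ever allocated for the item.
	 */
	kmem_free(fip->foo_item.li_lv_shadow);
	kmem_cache_free(xfs_foo_zone, fip);
}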
Signed-off-by: Dave Chinner <dchinner(a)redhat.com>
Reviewed-by: Darrick J. Wong <djwong(a)kernel.org>
Reviewed-by: Allison Henderson <allison.henderson(a)oracle.com>
Signed-off-by: Dave Chinner <david(a)fromorbit.com>
conflicts:
fs/xfs/xfs_bmap_item.c
fs/xfs/xfs_icreate_item.c
fs/xfs/xfs_refcount_item.c
fs/xfs/xfs_rmap_item.c
Signed-off-by: Li Nan <linan122(a)huawei.com>
Reviewed-by: Yang Erkun <yangerkun(a)huawei.com>
Reviewed-by: Zhang Yi <yi.zhang(a)huawei.com>
Signed-off-by: Jialin Zhang <zhangjialin11(a)huawei.com>
---
fs/xfs/xfs_bmap_item.c | 2 ++
fs/xfs/xfs_icreate_item.c | 1 +
fs/xfs/xfs_refcount_item.c | 2 ++
fs/xfs/xfs_rmap_item.c | 2 ++
4 files changed, 7 insertions(+)
diff --git a/fs/xfs/xfs_bmap_item.c b/fs/xfs/xfs_bmap_item.c
index 44ec0f2d5253..e6de8081451f 100644
--- a/fs/xfs/xfs_bmap_item.c
+++ b/fs/xfs/xfs_bmap_item.c
@@ -40,6 +40,7 @@ STATIC void
xfs_bui_item_free(
struct xfs_bui_log_item *buip)
{
+ kmem_free(buip->bui_item.li_lv_shadow);
kmem_cache_free(xfs_bui_zone, buip);
}
@@ -199,6 +200,7 @@ xfs_bud_item_release(
struct xfs_bud_log_item *budp = BUD_ITEM(lip);
xfs_bui_release(budp->bud_buip);
+ kmem_free(budp->bud_item.li_lv_shadow);
kmem_cache_free(xfs_bud_zone, budp);
}
diff --git a/fs/xfs/xfs_icreate_item.c b/fs/xfs/xfs_icreate_item.c
index 9b3994b9c716..aa8c7c261d24 100644
--- a/fs/xfs/xfs_icreate_item.c
+++ b/fs/xfs/xfs_icreate_item.c
@@ -63,6 +63,7 @@ STATIC void
xfs_icreate_item_release(
struct xfs_log_item *lip)
{
+ kmem_free(ICR_ITEM(lip)->ic_item.li_lv_shadow);
kmem_cache_free(xfs_icreate_zone, ICR_ITEM(lip));
}
diff --git a/fs/xfs/xfs_refcount_item.c b/fs/xfs/xfs_refcount_item.c
index 0dee316283a9..9f4ff45c7a93 100644
--- a/fs/xfs/xfs_refcount_item.c
+++ b/fs/xfs/xfs_refcount_item.c
@@ -35,6 +35,7 @@ STATIC void
xfs_cui_item_free(
struct xfs_cui_log_item *cuip)
{
+ kmem_free(cuip->cui_item.li_lv_shadow);
if (cuip->cui_format.cui_nextents > XFS_CUI_MAX_FAST_EXTENTS)
kmem_free(cuip);
else
@@ -204,6 +205,7 @@ xfs_cud_item_release(
struct xfs_cud_log_item *cudp = CUD_ITEM(lip);
xfs_cui_release(cudp->cud_cuip);
+ kmem_free(cudp->cud_item.li_lv_shadow);
kmem_cache_free(xfs_cud_zone, cudp);
}
diff --git a/fs/xfs/xfs_rmap_item.c b/fs/xfs/xfs_rmap_item.c
index 20905953fe76..b5447ac7cb9b 100644
--- a/fs/xfs/xfs_rmap_item.c
+++ b/fs/xfs/xfs_rmap_item.c
@@ -35,6 +35,7 @@ STATIC void
xfs_rui_item_free(
struct xfs_rui_log_item *ruip)
{
+ kmem_free(ruip->rui_item.li_lv_shadow);
if (ruip->rui_format.rui_nextents > XFS_RUI_MAX_FAST_EXTENTS)
kmem_free(ruip);
else
@@ -227,6 +228,7 @@ xfs_rud_item_release(
struct xfs_rud_log_item *rudp = RUD_ITEM(lip);
xfs_rui_release(rudp->rud_ruip);
+ kmem_free(rudp->rud_item.li_lv_shadow);
kmem_cache_free(xfs_rud_zone, rudp);
}
--
2.25.1

[PATCH openEuler-5.10-LTS-SP1 01/74] drm/msm/dsi: fix the inconsistent indenting
by Jialin Zhang 07 Feb '23
From: sunliming <sunliming(a)kylinos.cn>
stable inclusion
from stable-v5.10.142
commit 631fbefd877721d15c4f525cc71e851e0e588c8e
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I6CSFH
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id…
--------------------------------
[ Upstream commit 2f25a1fb4ec516c5ad67afd754334b491b9f09a5 ]
Fix the inconsistent indenting in function msm_dsi_dphy_timing_calc_v3().
Fix the following smatch warnings:
drivers/gpu/drm/msm/dsi/phy/dsi_phy.c:350 msm_dsi_dphy_timing_calc_v3() warn: inconsistent indenting
Fixes: f1fa7ff44056 ("drm/msm/dsi: implement auto PHY timing calculator for 10nm PHY")
Reported-by: kernel test robot <lkp(a)intel.com>
Signed-off-by: sunliming <sunliming(a)kylinos.cn>
Reviewed-by: Abhinav Kumar <quic_abhinavk(a)quicinc.com>
Patchwork: https://patchwork.freedesktop.org/patch/494662/
Link: https://lore.kernel.org/r/20220719015622.646718-1-sunliming@kylinos.cn
Signed-off-by: Abhinav Kumar <quic_abhinavk(a)quicinc.com>
Signed-off-by: Sasha Levin <sashal(a)kernel.org>
Signed-off-by: Jialin Zhang <zhangjialin11(a)huawei.com>
Reviewed-by: Zheng Zengkai <zhengzengkai(a)huawei.com>
---
drivers/gpu/drm/msm/dsi/phy/dsi_phy.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/drivers/gpu/drm/msm/dsi/phy/dsi_phy.c b/drivers/gpu/drm/msm/dsi/phy/dsi_phy.c
index e07986ab52c2..2e0be85ec394 100644
--- a/drivers/gpu/drm/msm/dsi/phy/dsi_phy.c
+++ b/drivers/gpu/drm/msm/dsi/phy/dsi_phy.c
@@ -345,7 +345,7 @@ int msm_dsi_dphy_timing_calc_v3(struct msm_dsi_dphy_timing *timing,
} else {
timing->shared_timings.clk_pre =
linear_inter(tmax, tmin, pcnt2, 0, false);
- timing->shared_timings.clk_pre_inc_by_2 = 0;
+ timing->shared_timings.clk_pre_inc_by_2 = 0;
}
timing->ta_go = 3;
--
2.25.1

[PATCH openEuler-1.0-LTS 1/2] block: don't allow a disk link holder to itself
by Yongqiang Liu 07 Feb '23
From: Yu Kuai <yukuai3(a)huawei.com>
mainline inclusion
from mainline-v6.2-rc1
commit 077a4033541fc96fb0a955985aab7d1f353da831
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I6B4N7
CVE: NA
--------------------------------
After creating a dm device, the user can reload that dm device with
itself as a target, and an endless loop is triggered because dm keeps
looking up itself.
Test procedures:
1) dmsetup create test --table "xxx sda", assume dm-0 is created
2) dmsetup suspend test
3) dmsetup reload test --table "xxx dm-0"
4) dmsetup resume test
Test result:
BUG: TASK stack guard page was hit at 00000000736a261f (stack is 000000008d12c88d..00000000c8dd82d5)
stack guard page: 0000 [#1] PREEMPT SMP
CPU: 29 PID: 946 Comm: systemd-udevd Not tainted 6.1.0-rc3-next-20221101-00006-g17640ca3b0ee #1295
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS ?-20190727_073836-buildvm-ppc64le-16.ppc.fedoraproject.org-3.fc31 04/01/2014
RIP: 0010:dm_prepare_ioctl+0xf/0x1e0
Code: da 48 83 05 4a 7c 99 0b 01 41 89 c4 eb cd e8 b8 1f 40 00 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 41 57 48 83 05 a1 5a 99 0b 01 <41> 56 49 89 d6 41 55 4c 8d af 90 02 00 00 9
RSP: 0018:ffffc90002090000 EFLAGS: 00010206
RAX: ffff8881049d6800 RBX: ffff88817e589000 RCX: 0000000000000000
RDX: ffffc90002090010 RSI: ffffc9000209001c RDI: ffff88817e589000
RBP: 00000000484a101d R08: 0000000000000000 R09: 0000000000000007
R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000005331
R13: 0000000000005331 R14: 0000000000000000 R15: 0000000000000000
FS: 00007fddf9609200(0000) GS:ffff889fbfd40000(0000) knlGS:0000000000000000
CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: ffffc9000208fff8 CR3: 0000000179043000 CR4: 00000000000006e0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
Call Trace:
<TASK>
dm_blk_ioctl+0x50/0x1c0
? dm_prepare_ioctl+0xe0/0x1e0
dm_blk_ioctl+0x88/0x1c0
dm_blk_ioctl+0x88/0x1c0
......(many identical lines)
dm_blk_ioctl+0x88/0x1c0
dm_blk_ioctl+0x88/0x1c0
blkdev_ioctl+0x184/0x3e0
__x64_sys_ioctl+0xa3/0x110
do_syscall_64+0x35/0x80
entry_SYSCALL_64_after_hwframe+0x63/0xcd
RIP: 0033:0x7fddf7306577
Code: b3 66 90 48 8b 05 11 89 2c 00 64 c7 00 26 00 00 00 48 c7 c0 ff ff ff ff c3 66 2e 0f 1f 84 00 00 00 00 00 b8 10 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d e1 88 8
RSP: 002b:00007ffd0b2ec318 EFLAGS: 00000246 ORIG_RAX: 0000000000000010
RAX: ffffffffffffffda RBX: 00005634ef478320 RCX: 00007fddf7306577
RDX: 0000000000000000 RSI: 0000000000005331 RDI: 0000000000000007
RBP: 0000000000000007 R08: 00005634ef4843e0 R09: 0000000000000080
R10: 00007fddf75cfb38 R11: 0000000000000246 R12: 00000000030d4000
R13: 0000000000000000 R14: 0000000000000000 R15: 00005634ef48b800
</TASK>
Modules linked in:
---[ end trace 0000000000000000 ]---
RIP: 0010:dm_prepare_ioctl+0xf/0x1e0
Code: da 48 83 05 4a 7c 99 0b 01 41 89 c4 eb cd e8 b8 1f 40 00 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 41 57 48 83 05 a1 5a 99 0b 01 <41> 56 49 89 d6 41 55 4c 8d af 90 02 00 00 9
RSP: 0018:ffffc90002090000 EFLAGS: 00010206
RAX: ffff8881049d6800 RBX: ffff88817e589000 RCX: 0000000000000000
RDX: ffffc90002090010 RSI: ffffc9000209001c RDI: ffff88817e589000
RBP: 00000000484a101d R08: 0000000000000000 R09: 0000000000000007
R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000005331
R13: 0000000000005331 R14: 0000000000000000 R15: 0000000000000000
FS: 00007fddf9609200(0000) GS:ffff889fbfd40000(0000) knlGS:0000000000000000
CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: ffffc9000208fff8 CR3: 0000000179043000 CR4: 00000000000006e0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
Kernel panic - not syncing: Fatal exception in interrupt
Kernel Offset: disabled
---[ end Kernel panic - not syncing: Fatal exception in interrupt ]---
Fix the problem by forbidding a disk from creating a link holder to itself.
Signed-off-by: Yu Kuai <yukuai3(a)huawei.com>
Reviewed-by: Christoph Hellwig <hch(a)lst.de>
Link: https://lore.kernel.org/r/20221115141054.1051801-11-yukuai1@huaweicloud.com
Signed-off-by: Jens Axboe <axboe(a)kernel.dk>
Signed-off-by: Li Lingfeng <lilingfeng3(a)huawei.com>
Reviewed-by: Yang Erkun <yangerkun(a)huawei.com>
Reviewed-by: Zhang Yi <yi.zhang(a)huawei.com>
Signed-off-by: Yongqiang Liu <liuyongqiang13(a)huawei.com>
---
fs/block_dev.c | 2 ++
1 file changed, 2 insertions(+)
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 2680092c022d..6ba91b97753f 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -1305,6 +1305,8 @@ int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk)
struct bd_holder_disk *holder;
int ret = 0;
+ if (bdev->bd_disk == disk)
+ return -EINVAL;
/*
* bdev could be deleted beneath us which would implicitly destroy
* the holder directory. Hold on to it.
--
2.25.1

[PATCH openEuler-1.0-LTS 1/3] ipv6: raw: Deduct extension header length in rawv6_push_pending_frames
by Yongqiang Liu 06 Feb '23
From: Herbert Xu <herbert(a)gondor.apana.org.au>
stable inclusion
from stable-v4.19.270
commit f487d636e49bc1fdfbd8105bc1ab159164e2d8bd
category: bugfix
bugzilla: 188291, https://gitee.com/src-openeuler/kernel/issues/I6B1V2
CVE: CVE-2023-0394
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?h=…
--------------------------------
commit cb3e9864cdbe35ff6378966660edbcbac955fe17 upstream.
The total cork length created by ip6_append_data includes extension
headers, so we must exclude them when comparing it against the
IPV6_CHECKSUM offset, which does not include extension headers.
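In sketch form, the adjustment made in rawv6_push_pending_frames()
below deducts the extension header length from the cork total before
the existing offset check runs (condensed from the hunks below, not a
complete function):

	struct ipv6_txoptions *opt = inet6_sk(sk)->cork.opt;

	total_len = inet_sk(sk)->cork.base.length;
	total_len -= opt ? opt->opt_flen : 0;	/* exclude extension headers */

	if (offset >= total_len - 1) {		/* offset: IPV6_CHECKSUM offset */
		err = -EINVAL;
		ip6_flush_pending_frames(sk);
	}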
Reported-by: Kyle Zeng <zengyhkyle(a)gmail.com>
Fixes: 357b40a18b04 ("[IPV6]: IPV6_CHECKSUM socket option can corrupt kernel memory")
Signed-off-by: Herbert Xu <herbert(a)gondor.apana.org.au>
Signed-off-by: David S. Miller <davem(a)davemloft.net>
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
Signed-off-by: Lu Wei <luwei32(a)huawei.com>
Reviewed-by: Yue Haibing <yuehaibing(a)huawei.com>
Reviewed-by: Xiu Jianfeng <xiujianfeng(a)huawei.com>
Signed-off-by: Yongqiang Liu <liuyongqiang13(a)huawei.com>
---
net/ipv6/raw.c | 4 ++++
1 file changed, 4 insertions(+)
diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c
index 412d4f60a408..8ce6414edd88 100644
--- a/net/ipv6/raw.c
+++ b/net/ipv6/raw.c
@@ -543,6 +543,7 @@ static int rawv6_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
static int rawv6_push_pending_frames(struct sock *sk, struct flowi6 *fl6,
struct raw6_sock *rp)
{
+ struct ipv6_txoptions *opt;
struct sk_buff *skb;
int err = 0;
int offset;
@@ -560,6 +561,9 @@ static int rawv6_push_pending_frames(struct sock *sk, struct flowi6 *fl6,
offset = rp->offset;
total_len = inet_sk(sk)->cork.base.length;
+ opt = inet6_sk(sk)->cork.opt;
+ total_len -= opt ? opt->opt_flen : 0;
+
if (offset >= total_len - 1) {
err = -EINVAL;
ip6_flush_pending_frames(sk);
--
2.25.1

[openEuler-5.10 1/4] SUNRPC: Clean up the handling of page padding in rpc_prepare_reply_pages()
by Zheng Zengkai 06 Feb '23
From: Trond Myklebust <trond.myklebust(a)hammerspace.com>
mainline inclusion
from mainline-v5.11-rc1
commit 9ed5af268e88f6e5b65376be98d652b37cb20d7b
bugzilla: https://gitee.com/openeuler/kernel/issues/I6D0MJ
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?…
--------------------------------
rpc_prepare_reply_pages() currently expects the 'hdrsize' argument to
contain the length of the data that we expect to be placed in the head
kvec, plus a count of 1 word of padding that is placed after the page data.
This is very confusing when trying to read the code, and sometimes leads
to callers adding an arbitrary value of '1' just to satisfy the
requirement (whether or not the page data actually needs such padding).
This patch aims to clarify the code by changing the 'hdrsize' argument
to remove that 1 word of padding. This means we need to subtract the
padding from all the existing callers.
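Concretely, each XDR header now defines an explicit pad constant and the
callers subtract it at the call site. A condensed sketch of the NFSv2
case, taken from the hunks below:

	#define NFS_pagepad_sz		(1)	/* one 32-bit word of XDR page padding */
	#define NFS_readlinkres_sz	(2 + NFS_pagepad_sz)

	/* 'hdrsize' now covers only the head kvec; the page pad is subtracted. */
	rpc_prepare_reply_pages(req, args->pages, args->pgbase, args->pglen,
				NFS_readlinkres_sz - NFS_pagepad_sz);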
Fixes: 02ef04e432ba ("NFS: Account for XDR pad of buf->pages")
Signed-off-by: Trond Myklebust <trond.myklebust(a)hammerspace.com>
Signed-off-by: Zheng Zengkai <zhengzengkai(a)huawei.com>
---
fs/nfs/nfs2xdr.c | 19 ++++++++++---------
fs/nfs/nfs3xdr.c | 29 ++++++++++++++++-------------
fs/nfs/nfs4xdr.c | 36 +++++++++++++++++++-----------------
net/sunrpc/clnt.c | 5 +----
net/sunrpc/xdr.c | 3 ---
5 files changed, 46 insertions(+), 46 deletions(-)
diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c
index 5e6453e9b307..3d5ba43f44bb 100644
--- a/fs/nfs/nfs2xdr.c
+++ b/fs/nfs/nfs2xdr.c
@@ -34,6 +34,7 @@
* Declare the space requirements for NFS arguments and replies as
* number of 32bit-words
*/
+#define NFS_pagepad_sz (1) /* Page padding */
#define NFS_fhandle_sz (8)
#define NFS_sattr_sz (8)
#define NFS_filename_sz (1+(NFS2_MAXNAMLEN>>2))
@@ -56,11 +57,11 @@
#define NFS_attrstat_sz (1+NFS_fattr_sz)
#define NFS_diropres_sz (1+NFS_fhandle_sz+NFS_fattr_sz)
-#define NFS_readlinkres_sz (2+1)
-#define NFS_readres_sz (1+NFS_fattr_sz+1+1)
+#define NFS_readlinkres_sz (2+NFS_pagepad_sz)
+#define NFS_readres_sz (1+NFS_fattr_sz+1+NFS_pagepad_sz)
#define NFS_writeres_sz (NFS_attrstat_sz)
#define NFS_stat_sz (1)
-#define NFS_readdirres_sz (1+1)
+#define NFS_readdirres_sz (1+NFS_pagepad_sz)
#define NFS_statfsres_sz (1+NFS_info_sz)
static int nfs_stat_to_errno(enum nfs_stat);
@@ -592,8 +593,8 @@ static void nfs2_xdr_enc_readlinkargs(struct rpc_rqst *req,
const struct nfs_readlinkargs *args = data;
encode_fhandle(xdr, args->fh);
- rpc_prepare_reply_pages(req, args->pages, args->pgbase,
- args->pglen, NFS_readlinkres_sz);
+ rpc_prepare_reply_pages(req, args->pages, args->pgbase, args->pglen,
+ NFS_readlinkres_sz - NFS_pagepad_sz);
}
/*
@@ -628,8 +629,8 @@ static void nfs2_xdr_enc_readargs(struct rpc_rqst *req,
const struct nfs_pgio_args *args = data;
encode_readargs(xdr, args);
- rpc_prepare_reply_pages(req, args->pages, args->pgbase,
- args->count, NFS_readres_sz);
+ rpc_prepare_reply_pages(req, args->pages, args->pgbase, args->count,
+ NFS_readres_sz - NFS_pagepad_sz);
req->rq_rcv_buf.flags |= XDRBUF_READ;
}
@@ -786,8 +787,8 @@ static void nfs2_xdr_enc_readdirargs(struct rpc_rqst *req,
const struct nfs_readdirargs *args = data;
encode_readdirargs(xdr, args);
- rpc_prepare_reply_pages(req, args->pages, 0,
- args->count, NFS_readdirres_sz);
+ rpc_prepare_reply_pages(req, args->pages, 0, args->count,
+ NFS_readdirres_sz - NFS_pagepad_sz);
}
/*
diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c
index b5a9379b1450..2c0cad7c51d6 100644
--- a/fs/nfs/nfs3xdr.c
+++ b/fs/nfs/nfs3xdr.c
@@ -33,6 +33,7 @@
* Declare the space requirements for NFS arguments and replies as
* number of 32bit-words
*/
+#define NFS3_pagepad_sz (1) /* Page padding */
#define NFS3_fhandle_sz (1+16)
#define NFS3_fh_sz (NFS3_fhandle_sz) /* shorthand */
#define NFS3_post_op_fh_sz (1+NFS3_fh_sz)
@@ -70,13 +71,13 @@
#define NFS3_removeres_sz (NFS3_setattrres_sz)
#define NFS3_lookupres_sz (1+NFS3_fh_sz+(2 * NFS3_post_op_attr_sz))
#define NFS3_accessres_sz (1+NFS3_post_op_attr_sz+1)
-#define NFS3_readlinkres_sz (1+NFS3_post_op_attr_sz+1+1)
-#define NFS3_readres_sz (1+NFS3_post_op_attr_sz+3+1)
+#define NFS3_readlinkres_sz (1+NFS3_post_op_attr_sz+1+NFS3_pagepad_sz)
+#define NFS3_readres_sz (1+NFS3_post_op_attr_sz+3+NFS3_pagepad_sz)
#define NFS3_writeres_sz (1+NFS3_wcc_data_sz+4)
#define NFS3_createres_sz (1+NFS3_post_op_fh_sz+NFS3_post_op_attr_sz+NFS3_wcc_data_sz)
#define NFS3_renameres_sz (1+(2 * NFS3_wcc_data_sz))
#define NFS3_linkres_sz (1+NFS3_post_op_attr_sz+NFS3_wcc_data_sz)
-#define NFS3_readdirres_sz (1+NFS3_post_op_attr_sz+2+1)
+#define NFS3_readdirres_sz (1+NFS3_post_op_attr_sz+2+NFS3_pagepad_sz)
#define NFS3_fsstatres_sz (1+NFS3_post_op_attr_sz+13)
#define NFS3_fsinfores_sz (1+NFS3_post_op_attr_sz+12)
#define NFS3_pathconfres_sz (1+NFS3_post_op_attr_sz+6)
@@ -86,7 +87,8 @@
#define ACL3_setaclargs_sz (NFS3_fh_sz+1+ \
XDR_QUADLEN(NFS_ACL_INLINE_BUFSIZE))
#define ACL3_getaclres_sz (1+NFS3_post_op_attr_sz+1+ \
- XDR_QUADLEN(NFS_ACL_INLINE_BUFSIZE)+1)
+ XDR_QUADLEN(NFS_ACL_INLINE_BUFSIZE)+\
+ NFS3_pagepad_sz)
#define ACL3_setaclres_sz (1+NFS3_post_op_attr_sz)
static int nfs3_stat_to_errno(enum nfs_stat);
@@ -910,8 +912,8 @@ static void nfs3_xdr_enc_readlink3args(struct rpc_rqst *req,
const struct nfs3_readlinkargs *args = data;
encode_nfs_fh3(xdr, args->fh);
- rpc_prepare_reply_pages(req, args->pages, args->pgbase,
- args->pglen, NFS3_readlinkres_sz);
+ rpc_prepare_reply_pages(req, args->pages, args->pgbase, args->pglen,
+ NFS3_readlinkres_sz - NFS3_pagepad_sz);
}
/*
@@ -940,7 +942,8 @@ static void nfs3_xdr_enc_read3args(struct rpc_rqst *req,
const void *data)
{
const struct nfs_pgio_args *args = data;
- unsigned int replen = args->replen ? args->replen : NFS3_readres_sz;
+ unsigned int replen = args->replen ? args->replen :
+ NFS3_readres_sz - NFS3_pagepad_sz;
encode_read3args(xdr, args);
rpc_prepare_reply_pages(req, args->pages, args->pgbase,
@@ -1240,8 +1243,8 @@ static void nfs3_xdr_enc_readdir3args(struct rpc_rqst *req,
const struct nfs3_readdirargs *args = data;
encode_readdir3args(xdr, args);
- rpc_prepare_reply_pages(req, args->pages, 0,
- args->count, NFS3_readdirres_sz);
+ rpc_prepare_reply_pages(req, args->pages, 0, args->count,
+ NFS3_readdirres_sz - NFS3_pagepad_sz);
}
/*
@@ -1282,8 +1285,8 @@ static void nfs3_xdr_enc_readdirplus3args(struct rpc_rqst *req,
const struct nfs3_readdirargs *args = data;
encode_readdirplus3args(xdr, args);
- rpc_prepare_reply_pages(req, args->pages, 0,
- args->count, NFS3_readdirres_sz);
+ rpc_prepare_reply_pages(req, args->pages, 0, args->count,
+ NFS3_readdirres_sz - NFS3_pagepad_sz);
}
/*
@@ -1329,7 +1332,7 @@ static void nfs3_xdr_enc_getacl3args(struct rpc_rqst *req,
if (args->mask & (NFS_ACL | NFS_DFACL)) {
rpc_prepare_reply_pages(req, args->pages, 0,
NFSACL_MAXPAGES << PAGE_SHIFT,
- ACL3_getaclres_sz);
+ ACL3_getaclres_sz - NFS3_pagepad_sz);
req->rq_rcv_buf.flags |= XDRBUF_SPARSE_PAGES;
}
}
@@ -1649,7 +1652,7 @@ static int nfs3_xdr_dec_read3res(struct rpc_rqst *req, struct xdr_stream *xdr,
result->op_status = status;
if (status != NFS3_OK)
goto out_status;
- result->replen = 4 + ((xdr_stream_pos(xdr) - pos) >> 2);
+ result->replen = 3 + ((xdr_stream_pos(xdr) - pos) >> 2);
error = decode_read3resok(xdr, result);
out:
return error;
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index e2f0e3446e22..f64b28d5bc22 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -84,6 +84,7 @@ static int decode_layoutget(struct xdr_stream *xdr, struct rpc_rqst *req,
/* lock,open owner id:
* we currently use size 2 (u64) out of (NFS4_OPAQUE_LIMIT >> 2)
*/
+#define pagepad_maxsz (1)
#define open_owner_id_maxsz (1 + 2 + 1 + 1 + 2)
#define lock_owner_id_maxsz (1 + 1 + 4)
#define decode_lockowner_maxsz (1 + XDR_QUADLEN(IDMAP_NAMESZ))
@@ -215,14 +216,14 @@ static int decode_layoutget(struct xdr_stream *xdr, struct rpc_rqst *req,
nfs4_fattr_bitmap_maxsz)
#define encode_read_maxsz (op_encode_hdr_maxsz + \
encode_stateid_maxsz + 3)
-#define decode_read_maxsz (op_decode_hdr_maxsz + 2 + 1)
+#define decode_read_maxsz (op_decode_hdr_maxsz + 2 + pagepad_maxsz)
#define encode_readdir_maxsz (op_encode_hdr_maxsz + \
2 + encode_verifier_maxsz + 5 + \
nfs4_label_maxsz)
#define decode_readdir_maxsz (op_decode_hdr_maxsz + \
- decode_verifier_maxsz + 1)
+ decode_verifier_maxsz + pagepad_maxsz)
#define encode_readlink_maxsz (op_encode_hdr_maxsz)
-#define decode_readlink_maxsz (op_decode_hdr_maxsz + 1 + 1)
+#define decode_readlink_maxsz (op_decode_hdr_maxsz + 1 + pagepad_maxsz)
#define encode_write_maxsz (op_encode_hdr_maxsz + \
encode_stateid_maxsz + 4)
#define decode_write_maxsz (op_decode_hdr_maxsz + \
@@ -284,14 +285,14 @@ static int decode_layoutget(struct xdr_stream *xdr, struct rpc_rqst *req,
#define decode_delegreturn_maxsz (op_decode_hdr_maxsz)
#define encode_getacl_maxsz (encode_getattr_maxsz)
#define decode_getacl_maxsz (op_decode_hdr_maxsz + \
- nfs4_fattr_bitmap_maxsz + 1 + 1)
+ nfs4_fattr_bitmap_maxsz + 1 + pagepad_maxsz)
#define encode_setacl_maxsz (op_encode_hdr_maxsz + \
encode_stateid_maxsz + 3)
#define decode_setacl_maxsz (decode_setattr_maxsz)
#define encode_fs_locations_maxsz \
(encode_getattr_maxsz)
#define decode_fs_locations_maxsz \
- (1)
+ (pagepad_maxsz)
#define encode_secinfo_maxsz (op_encode_hdr_maxsz + nfs4_name_maxsz)
#define decode_secinfo_maxsz (op_decode_hdr_maxsz + 1 + ((NFS_MAX_SECFLAVORS * (16 + GSS_OID_MAX_LEN)) / 4))
@@ -393,12 +394,13 @@ static int decode_layoutget(struct xdr_stream *xdr, struct rpc_rqst *req,
/* devaddr4 payload is read into page */ \
1 /* notification bitmap length */ + \
1 /* notification bitmap, word 0 */ + \
- 1 /* possible XDR padding */)
+ pagepad_maxsz /* possible XDR padding */)
#define encode_layoutget_maxsz (op_encode_hdr_maxsz + 10 + \
encode_stateid_maxsz)
#define decode_layoutget_maxsz (op_decode_hdr_maxsz + 8 + \
decode_stateid_maxsz + \
- XDR_QUADLEN(PNFS_LAYOUT_MAXSIZE) + 1)
+ XDR_QUADLEN(PNFS_LAYOUT_MAXSIZE) + \
+ pagepad_maxsz)
#define encode_layoutcommit_maxsz (op_encode_hdr_maxsz + \
2 /* offset */ + \
2 /* length */ + \
@@ -2342,7 +2344,7 @@ static void nfs4_xdr_enc_open(struct rpc_rqst *req, struct xdr_stream *xdr,
encode_layoutget(xdr, args->lg_args, &hdr);
rpc_prepare_reply_pages(req, args->lg_args->layout.pages, 0,
args->lg_args->layout.pglen,
- hdr.replen);
+ hdr.replen - pagepad_maxsz);
}
encode_nops(&hdr);
}
@@ -2388,7 +2390,7 @@ static void nfs4_xdr_enc_open_noattr(struct rpc_rqst *req,
encode_layoutget(xdr, args->lg_args, &hdr);
rpc_prepare_reply_pages(req, args->lg_args->layout.pages, 0,
args->lg_args->layout.pglen,
- hdr.replen);
+ hdr.replen - pagepad_maxsz);
}
encode_nops(&hdr);
}
@@ -2499,7 +2501,7 @@ static void nfs4_xdr_enc_readlink(struct rpc_rqst *req, struct xdr_stream *xdr,
encode_readlink(xdr, args, req, &hdr);
rpc_prepare_reply_pages(req, args->pages, args->pgbase,
- args->pglen, hdr.replen);
+ args->pglen, hdr.replen - pagepad_maxsz);
encode_nops(&hdr);
}
@@ -2520,7 +2522,7 @@ static void nfs4_xdr_enc_readdir(struct rpc_rqst *req, struct xdr_stream *xdr,
encode_readdir(xdr, args, req, &hdr);
rpc_prepare_reply_pages(req, args->pages, args->pgbase,
- args->count, hdr.replen);
+ args->count, hdr.replen - pagepad_maxsz);
encode_nops(&hdr);
}
@@ -2541,7 +2543,7 @@ static void nfs4_xdr_enc_read(struct rpc_rqst *req, struct xdr_stream *xdr,
encode_read(xdr, args, &hdr);
rpc_prepare_reply_pages(req, args->pages, args->pgbase,
- args->count, hdr.replen);
+ args->count, hdr.replen - pagepad_maxsz);
req->rq_rcv_buf.flags |= XDRBUF_READ;
encode_nops(&hdr);
}
@@ -2588,7 +2590,7 @@ static void nfs4_xdr_enc_getacl(struct rpc_rqst *req, struct xdr_stream *xdr,
ARRAY_SIZE(nfs4_acl_bitmap), &hdr);
rpc_prepare_reply_pages(req, args->acl_pages, 0,
- args->acl_len, replen + 1);
+ args->acl_len, replen);
encode_nops(&hdr);
}
@@ -2810,7 +2812,7 @@ static void nfs4_xdr_enc_fs_locations(struct rpc_rqst *req,
}
rpc_prepare_reply_pages(req, (struct page **)&args->page, 0,
- PAGE_SIZE, replen + 1);
+ PAGE_SIZE, replen);
encode_nops(&hdr);
}
@@ -3014,14 +3016,14 @@ static void nfs4_xdr_enc_getdeviceinfo(struct rpc_rqst *req,
encode_compound_hdr(xdr, req, &hdr);
encode_sequence(xdr, &args->seq_args, &hdr);
- replen = hdr.replen + op_decode_hdr_maxsz;
+ replen = hdr.replen + op_decode_hdr_maxsz + 2;
encode_getdeviceinfo(xdr, args, &hdr);
/* set up reply kvec. device_addr4 opaque data is read into the
* pages */
rpc_prepare_reply_pages(req, args->pdev->pages, args->pdev->pgbase,
- args->pdev->pglen, replen + 2 + 1);
+ args->pdev->pglen, replen);
encode_nops(&hdr);
}
@@ -3043,7 +3045,7 @@ static void nfs4_xdr_enc_layoutget(struct rpc_rqst *req,
encode_layoutget(xdr, args, &hdr);
rpc_prepare_reply_pages(req, args->layout.pages, 0,
- args->layout.pglen, hdr.replen);
+ args->layout.pglen, hdr.replen - pagepad_maxsz);
encode_nops(&hdr);
}
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index 78c6648af782..88a4dff14666 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -1251,10 +1251,7 @@ void rpc_prepare_reply_pages(struct rpc_rqst *req, struct page **pages,
unsigned int base, unsigned int len,
unsigned int hdrsize)
{
- /* Subtract one to force an extra word of buffer space for the
- * payload's XDR pad to fall into the rcv_buf's tail iovec.
- */
- hdrsize += RPC_REPHDRSIZE + req->rq_cred->cr_auth->au_ralign - 1;
+ hdrsize += RPC_REPHDRSIZE + req->rq_cred->cr_auth->au_ralign;
xdr_inline_pages(&req->rq_rcv_buf, hdrsize << 2, pages, base, len);
trace_rpc_xdr_reply_pages(req->rq_task, &req->rq_rcv_buf);
diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c
index d84bb5037bb5..b1684b19b6d5 100644
--- a/net/sunrpc/xdr.c
+++ b/net/sunrpc/xdr.c
@@ -193,9 +193,6 @@ xdr_inline_pages(struct xdr_buf *xdr, unsigned int offset,
tail->iov_base = buf + offset;
tail->iov_len = buflen - offset;
- if ((xdr->page_len & 3) == 0)
- tail->iov_len -= sizeof(__be32);
-
xdr->buflen += len;
}
EXPORT_SYMBOL_GPL(xdr_inline_pages);
--
2.20.1

06 Feb '23
driver inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I6BSMN
-----------------------------------------------------------------------
For the ROH distributed scenario, the EID is allocated via DHCP. The
driver needs to convert the original MAC address to EID format and
update the destination MAC, chaddr and client id (if present) when
transmitting DHCP packets. Meanwhile, the chaddr field should follow
the source MAC address so that the DHCP server replies to the right
client. Because the DHCP packet payload changes, the L4 checksum must
be recalculated as well.
Signed-off-by: Jian Shen <shenjian15(a)huawei.com>
Signed-off-by: Ke Chen <chenke54(a)huawei.com>
---
.../net/ethernet/hisilicon/hns3/hns3_enet.c | 172 +++++++++++++++++-
.../net/ethernet/hisilicon/hns3/hns3_enet.h | 50 +++++
.../hisilicon/hns3/hns3pf/hclge_main.c | 9 +
3 files changed, 226 insertions(+), 5 deletions(-)
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
index cf79cd69c766..460f9d217a18 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
@@ -1165,6 +1165,142 @@ static void hns3_tx_spare_reclaim_cb(struct hns3_enet_ring *ring,
}
}
+static struct hns3_dhcp_packet *hns3_get_dhcp_packet(struct sk_buff *skb,
+ int *dhcp_len)
+{
+ struct hns3_dhcp_packet *dhcp;
+ union l4_hdr_info l4;
+ int l4_payload_len;
+
+ l4.hdr = skb_transport_header(skb);
+ if (l4.udp->dest != htons(HNS3_DHCP_CLIENT_PORT) ||
+ l4.udp->source != htons(HNS3_DHCP_SERVER_PORT))
+ return NULL;
+
+ dhcp = (struct hns3_dhcp_packet *)(l4.hdr + sizeof(struct udphdr));
+ l4_payload_len = ntohs(l4.udp->len) - sizeof(struct udphdr);
+ if (l4_payload_len < offsetof(struct hns3_dhcp_packet, options) ||
+ dhcp->hlen != ETH_ALEN ||
+ dhcp->cookie != htonl(HNS3_DHCP_MAGIC))
+ return NULL;
+
+ *dhcp_len = l4_payload_len;
+ return dhcp;
+}
+
+static u8 *hns3_dhcp_option_scan(struct hns3_dhcp_packet *packet,
+ struct hns3_dhcp_opt_state *opt_state)
+{
+ int opt_len;
+ u8 *cur_opt;
+
+ /* option bytes: [code][len][data0~data[len-1]] */
+ while (opt_state->rem > 0) {
+ switch (opt_state->opt_ptr[DHCP_OPT_CODE]) {
+ /* option padding and end have no len and data byte. */
+ case DHCP_OPT_PADDING:
+ opt_state->rem--;
+ opt_state->opt_ptr++;
+ break;
+ case DHCP_OPT_END:
+ if (DHCP_OVERLOAD_USE_FILE(opt_state->overload_flag)) {
+ opt_state->overload_flag |=
+ DHCP_OVERLOAD_FILE_USED;
+ opt_state->opt_ptr = packet->file;
+ opt_state->rem = sizeof(packet->file);
+ break;
+ }
+ if (DHCP_OVERLOAD_USE_SNAME(opt_state->overload_flag)) {
+ opt_state->overload_flag |=
+ DHCP_OVERLOAD_SNAME_USED;
+ opt_state->opt_ptr = packet->sname;
+ opt_state->rem = sizeof(packet->sname);
+ break;
+ }
+ return NULL;
+ default:
+ if (opt_state->rem <= DHCP_OPT_LEN)
+ return NULL;
+ /* opt_len includes code, len and data bytes */
+ opt_len = opt_state->opt_ptr[DHCP_OPT_LEN] +
+ DHCP_OPT_DATA;
+ cur_opt = opt_state->opt_ptr;
+ if (opt_state->rem < opt_len)
+ return NULL;
+
+ opt_state->opt_ptr += opt_len;
+ opt_state->rem -= opt_len;
+ if (cur_opt[DHCP_OPT_CODE] == DHCP_OPT_OVERLOAD) {
+ opt_state->overload_flag |=
+ cur_opt[DHCP_OPT_DATA];
+ break;
+ }
+ return cur_opt;
+ }
+ }
+
+ return NULL;
+}
+
+static void hns3_dhcp_update_option61(struct hns3_nic_priv *priv,
+ struct hns3_dhcp_packet *packet,
+ int dhcp_len)
+{
+ struct hns3_dhcp_opt_state opt_state;
+ u8 *cur_opt;
+
+ opt_state.opt_ptr = packet->options;
+ opt_state.rem = dhcp_len - offsetof(struct hns3_dhcp_packet, options);
+ opt_state.overload_flag = 0;
+
+ cur_opt = hns3_dhcp_option_scan(packet, &opt_state);
+ while (cur_opt) {
+ if (cur_opt[DHCP_OPT_CODE] != DHCP_OPT_CLIENT_ID) {
+ cur_opt = hns3_dhcp_option_scan(packet, &opt_state);
+ continue;
+ }
+ if (cur_opt[DHCP_OPT_LEN] > ETH_ALEN)
+ ether_addr_copy(&cur_opt[DHCP_CLIENT_ID_MAC_OFT],
+ priv->roh_perm_mac);
+ break;
+ }
+}
+
+static void hns3_dhcp_cal_l4_csum(struct sk_buff *skb)
+{
+ union l3_hdr_info l3;
+ union l4_hdr_info l4;
+ __wsum csum = 0;
+ int offset;
+
+ if (skb->ip_summed == CHECKSUM_PARTIAL)
+ return;
+
+ l3.hdr = skb_network_header(skb);
+ l4.hdr = skb_transport_header(skb);
+ offset = skb_transport_offset(skb);
+ l4.udp->check = 0;
+ csum = csum_partial(l4.udp, ntohs(l4.udp->len), 0);
+ l4.udp->check = csum_tcpudp_magic(l3.v4->saddr, l3.v4->daddr,
+ skb->len - offset, IPPROTO_UDP, csum);
+}
+
+static void hns3_dhcp_packet_convert(struct hns3_nic_priv *priv,
+ struct sk_buff *skb,
+ struct hns3_dhcp_packet *dhcp,
+ int dhcp_len)
+{
+ struct ethhdr *l2hdr = eth_hdr(skb);
+
+ if (!dhcp)
+ return;
+
+ ether_addr_copy(dhcp->chaddr, l2hdr->h_source);
+ hns3_dhcp_update_option61(priv, dhcp, dhcp_len);
+ /* for l4 payload changed, need to re-calculate the csum */
+ hns3_dhcp_cal_l4_csum(skb);
+}
+
static int hns3_set_tso(struct sk_buff *skb, u32 *paylen_fdop_ol4cs,
u16 *mss, u32 *type_cs_vlan_tso, u32 *send_bytes)
{
@@ -1716,7 +1852,20 @@ static int hns3_handle_csum_partial(struct hns3_enet_ring *ring,
return 0;
}
-static int hns3_fill_skb_desc(struct hns3_enet_ring *ring,
+static bool hns3_roh_check_udpv4(struct sk_buff *skb)
+{
+ union l3_hdr_info l3;
+
+ l3.hdr = skb_network_header(skb);
+ if (skb->protocol != htons(ETH_P_IP) ||
+ l3.v4->version != IP_VERSION_IPV4)
+ return false;
+
+ return l3.v4->protocol == IPPROTO_UDP;
+}
+
+static int hns3_fill_skb_desc(struct hns3_nic_priv *priv,
+ struct hns3_enet_ring *ring,
struct sk_buff *skb, struct hns3_desc *desc,
struct hns3_desc_cb *desc_cb)
{
@@ -1741,6 +1890,14 @@ static int hns3_fill_skb_desc(struct hns3_enet_ring *ring,
hnae3_set_field(param.paylen_fdop_ol4cs, HNS3_TXD_FD_OP_M,
HNS3_TXD_FD_OP_S, fd_op);
+ if (hns3_roh_check_udpv4(skb)) {
+ struct hns3_dhcp_packet *dhcp;
+ int dhcp_len;
+
+ dhcp = hns3_get_dhcp_packet(skb, &dhcp_len);
+ hns3_dhcp_packet_convert(priv, skb, dhcp, dhcp_len);
+ }
+
/* Set txbd */
desc->tx.ol_type_vlan_len_msec =
cpu_to_le32(param.ol_type_vlan_len_msec);
@@ -2338,15 +2495,16 @@ static int hns3_handle_desc_filling(struct hns3_enet_ring *ring,
return hns3_fill_skb_to_desc(ring, skb, DESC_TYPE_SKB);
}
-static int hns3_handle_skb_desc(struct hns3_enet_ring *ring,
+static int hns3_handle_skb_desc(struct hns3_nic_priv *priv,
+ struct hns3_enet_ring *ring,
struct sk_buff *skb,
struct hns3_desc_cb *desc_cb,
int next_to_use_head)
{
int ret;
- ret = hns3_fill_skb_desc(ring, skb, &ring->desc[ring->next_to_use],
- desc_cb);
+ ret = hns3_fill_skb_desc(priv, ring, skb,
+ &ring->desc[ring->next_to_use], desc_cb);
if (unlikely(ret < 0))
goto fill_err;
@@ -2395,7 +2553,7 @@ netdev_tx_t hns3_nic_net_xmit(struct sk_buff *skb, struct net_device *netdev)
goto out_err_tx_ok;
}
- ret = hns3_handle_skb_desc(ring, skb, desc_cb, ring->next_to_use);
+ ret = hns3_handle_skb_desc(priv, ring, skb, desc_cb, ring->next_to_use);
if (unlikely(ret <= 0))
goto out_err_tx_ok;
@@ -5226,6 +5384,9 @@ static int hns3_init_mac_addr(struct net_device *netdev)
return 0;
}
+ if (is_zero_ether_addr(priv->roh_perm_mac))
+ ether_addr_copy(priv->roh_perm_mac, netdev->dev_addr);
+
if (h->ae_algo->ops->set_mac_addr)
ret = h->ae_algo->ops->set_mac_addr(h, netdev->dev_addr, true);
@@ -5377,6 +5538,7 @@ static int hns3_client_init(struct hnae3_handle *handle)
priv->tx_timeout_count = 0;
priv->max_non_tso_bd_num = ae_dev->dev_specs.max_non_tso_bd_num;
set_bit(HNS3_NIC_STATE_DOWN, &priv->state);
+ eth_zero_addr(priv->roh_perm_mac);
handle->msg_enable = netif_msg_init(debug, DEFAULT_MSG_LEVEL);
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.h b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.h
index ccfd38b0028e..85c352fff83b 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.h
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.h
@@ -604,6 +604,56 @@ struct hns3_nic_priv {
struct hns3_enet_coalesce rx_coal;
u32 tx_copybreak;
u32 rx_copybreak;
+ u8 roh_perm_mac[ETH_ALEN];
+};
+
+#define HNS3_DHCP_SERVER_PORT 68
+#define HNS3_DHCP_CLIENT_PORT 67
+#define HNS3_DHCP_MAGIC 0x63825363
+#define DHCP_OPT_CODE 0
+#define DHCP_OPT_LEN 1
+#define DHCP_OPT_DATA 2
+#define DHCP_CLIENT_ID_LEN 7
+#define DHCP_CLIENT_ID_MAC_OFT 3
+#define DHCP_OVERLOAD_FILE 0x1
+#define DHCP_OVERLOAD_SNAME 0x2
+#define DHCP_OVERLOAD_FILE_USED 0x101
+#define DHCP_OVERLOAD_SNAME_USED 0x202
+#define DHCP_OVERLOAD_USE_FILE(x) \
+ (((x) & DHCP_OVERLOAD_FILE_USED) == DHCP_OVERLOAD_FILE)
+#define DHCP_OVERLOAD_USE_SNAME(x) \
+ (((x) & DHCP_OVERLOAD_SNAME_USED) == DHCP_OVERLOAD_SNAME)
+
+enum DHCP_OPTION_CODES {
+ DHCP_OPT_PADDING = 0,
+ DHCP_OPT_OVERLOAD = 52,
+ DHCP_OPT_CLIENT_ID = 61,
+ DHCP_OPT_END = 255
+};
+
+struct hns3_dhcp_packet {
+ u8 op;
+ u8 htype;
+ u8 hlen;
+ u8 hops;
+ u32 xid;
+ u16 secs;
+ u16 flags;
+ u32 ciaddr;
+ u32 yiaddr;
+ u32 siaddr_nip;
+ u32 gateway_nip;
+ u8 chaddr[16]; /* link-layer client hardware address (MAC) */
+ u8 sname[64];
+ u8 file[128];
+ u32 cookie; /* DHCP magic bytes: 0x63825363 */
+ u8 options[312];
+};
+
+struct hns3_dhcp_opt_state {
+ u8 *opt_ptr; /* refer to current option item */
+ int rem; /* remain bytes in options */
+ u32 overload_flag; /* whether use file and sname field as options */
};
union l3_hdr_info {
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
index eea17548416b..5c8a821aa61a 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
@@ -2866,12 +2866,21 @@ static void hclge_get_fec(struct hnae3_handle *handle, u8 *fec_ability,
if (fec_mode)
*fec_mode = mac->fec_mode;
}
+
+static void hclge_roh_convert_mac_addr(struct hclge_dev *hdev)
+{
+#define HCLGE_ROH_EID_MASK_BYTE 3
+
+ memset(hdev->hw.mac.mac_addr, 0, HCLGE_ROH_EID_MASK_BYTE);
+}
+
static int hclge_mac_init(struct hclge_dev *hdev)
{
struct hclge_mac *mac = &hdev->hw.mac;
int ret;
hclge_mac_type_init(hdev);
+ hclge_roh_convert_mac_addr(hdev);
hdev->support_sfp_query = true;
hdev->hw.mac.duplex = HCLGE_MAC_FULL;
--
2.30.0

06 Feb '23
From: Sebastian Andrzej Siewior <bigeasy(a)linutronix.de>
stable inclusion
from stable-v5.10.142
commit d71a1c9fce184718d1b3a51a9e8a6e31cbbb45ce
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I6D0ZE
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id…
-------------------------------------------------
commit 278d3ba61563ceed3cb248383ced19e14ec7bc1f upstream.
On 32bit-UP u64_stats_fetch_begin() disables only preemption. If the
reader is in preemptible context and the writer side
(u64_stats_update_begin*()) runs in an interrupt context (IRQ or
softirq) then the writer can update the stats during the read operation.
This update remains undetected.
Use u64_stats_fetch_begin_irq() to ensure the stats fetch on 32bit-UP
are not interrupted by a writer. 32bit-SMP remains unaffected by this
change.
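A minimal sketch of the reader pattern the patch switches every affected
driver to (the stats structure and function here are hypothetical; syncp
is the usual struct u64_stats_sync from the hunks below):

/*
 * Illustrative reader: the _irq variants additionally disable interrupts
 * on 32bit-UP, so a writer running from (soft)irq context cannot update
 * the counters in the middle of the fetch.
 */
static void example_get_stats(const struct example_stats *s,
			      u64 *packets, u64 *bytes)
{
	unsigned int start;

	do {
		start = u64_stats_fetch_begin_irq(&s->syncp);
		*packets = s->packets;
		*bytes = s->bytes;
	} while (u64_stats_fetch_retry_irq(&s->syncp, start));
}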
Cc: "David S. Miller" <davem(a)davemloft.net>
Cc: Catherine Sullivan <csully(a)google.com>
Cc: David Awogbemila <awogbemila(a)google.com>
Cc: Dimitris Michailidis <dmichail(a)fungible.com>
Cc: Eric Dumazet <edumazet(a)google.com>
Cc: Hans Ulli Kroll <ulli.kroll(a)googlemail.com>
Cc: Jakub Kicinski <kuba(a)kernel.org>
Cc: Jeroen de Borst <jeroendb(a)google.com>
Cc: Johannes Berg <johannes(a)sipsolutions.net>
Cc: Linus Walleij <linus.walleij(a)linaro.org>
Cc: Paolo Abeni <pabeni(a)redhat.com>
Cc: Simon Horman <simon.horman(a)corigine.com>
Cc: linux-arm-kernel(a)lists.infradead.org
Cc: linux-wireless(a)vger.kernel.org
Cc: netdev(a)vger.kernel.org
Cc: oss-drivers(a)corigine.com
Cc: stable(a)vger.kernel.org
Signed-off-by: Sebastian Andrzej Siewior <bigeasy(a)linutronix.de>
Reviewed-by: Simon Horman <simon.horman(a)corigine.com>
Signed-off-by: David S. Miller <davem(a)davemloft.net>
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
(cherry picked from commit d71a1c9fce184718d1b3a51a9e8a6e31cbbb45ce)
Signed-off-by: Wang Yufen <wangyufen(a)huawei.com>
Conflicts:
drivers/net/ethernet/huawei/hinic/hinic_rx.c
drivers/net/ethernet/huawei/hinic/hinic_tx.c
Signed-off-by: Wang Yufen <wangyufen(a)huawei.com>
---
drivers/net/ethernet/cortina/gemini.c | 24 +++++++++++-----------
drivers/net/ethernet/google/gve/gve_ethtool.c | 16 +++++++--------
drivers/net/ethernet/google/gve/gve_main.c | 12 +++++------
drivers/net/ethernet/huawei/hinic/hinic_rx.c | 4 ++--
drivers/net/ethernet/huawei/hinic/hinic_tx.c | 4 ++--
.../net/ethernet/netronome/nfp/nfp_net_common.c | 8 ++++----
.../net/ethernet/netronome/nfp/nfp_net_ethtool.c | 8 ++++----
drivers/net/netdevsim/netdev.c | 4 ++--
net/mac80211/sta_info.c | 8 ++++----
net/mpls/af_mpls.c | 4 ++--
10 files changed, 46 insertions(+), 46 deletions(-)
diff --git a/drivers/net/ethernet/cortina/gemini.c b/drivers/net/ethernet/cortina/gemini.c
index 3685878..b22ea40 100644
--- a/drivers/net/ethernet/cortina/gemini.c
+++ b/drivers/net/ethernet/cortina/gemini.c
@@ -1920,7 +1920,7 @@ static void gmac_get_stats64(struct net_device *netdev,
/* Racing with RX NAPI */
do {
- start = u64_stats_fetch_begin(&port->rx_stats_syncp);
+ start = u64_stats_fetch_begin_irq(&port->rx_stats_syncp);
stats->rx_packets = port->stats.rx_packets;
stats->rx_bytes = port->stats.rx_bytes;
@@ -1932,11 +1932,11 @@ static void gmac_get_stats64(struct net_device *netdev,
stats->rx_crc_errors = port->stats.rx_crc_errors;
stats->rx_frame_errors = port->stats.rx_frame_errors;
- } while (u64_stats_fetch_retry(&port->rx_stats_syncp, start));
+ } while (u64_stats_fetch_retry_irq(&port->rx_stats_syncp, start));
/* Racing with MIB and TX completion interrupts */
do {
- start = u64_stats_fetch_begin(&port->ir_stats_syncp);
+ start = u64_stats_fetch_begin_irq(&port->ir_stats_syncp);
stats->tx_errors = port->stats.tx_errors;
stats->tx_packets = port->stats.tx_packets;
@@ -1946,15 +1946,15 @@ static void gmac_get_stats64(struct net_device *netdev,
stats->rx_missed_errors = port->stats.rx_missed_errors;
stats->rx_fifo_errors = port->stats.rx_fifo_errors;
- } while (u64_stats_fetch_retry(&port->ir_stats_syncp, start));
+ } while (u64_stats_fetch_retry_irq(&port->ir_stats_syncp, start));
/* Racing with hard_start_xmit */
do {
- start = u64_stats_fetch_begin(&port->tx_stats_syncp);
+ start = u64_stats_fetch_begin_irq(&port->tx_stats_syncp);
stats->tx_dropped = port->stats.tx_dropped;
- } while (u64_stats_fetch_retry(&port->tx_stats_syncp, start));
+ } while (u64_stats_fetch_retry_irq(&port->tx_stats_syncp, start));
stats->rx_dropped += stats->rx_missed_errors;
}
@@ -2032,18 +2032,18 @@ static void gmac_get_ethtool_stats(struct net_device *netdev,
/* Racing with MIB interrupt */
do {
p = values;
- start = u64_stats_fetch_begin(&port->ir_stats_syncp);
+ start = u64_stats_fetch_begin_irq(&port->ir_stats_syncp);
for (i = 0; i < RX_STATS_NUM; i++)
*p++ = port->hw_stats[i];
- } while (u64_stats_fetch_retry(&port->ir_stats_syncp, start));
+ } while (u64_stats_fetch_retry_irq(&port->ir_stats_syncp, start));
values = p;
/* Racing with RX NAPI */
do {
p = values;
- start = u64_stats_fetch_begin(&port->rx_stats_syncp);
+ start = u64_stats_fetch_begin_irq(&port->rx_stats_syncp);
for (i = 0; i < RX_STATUS_NUM; i++)
*p++ = port->rx_stats[i];
@@ -2051,13 +2051,13 @@ static void gmac_get_ethtool_stats(struct net_device *netdev,
*p++ = port->rx_csum_stats[i];
*p++ = port->rx_napi_exits;
- } while (u64_stats_fetch_retry(&port->rx_stats_syncp, start));
+ } while (u64_stats_fetch_retry_irq(&port->rx_stats_syncp, start));
values = p;
/* Racing with TX start_xmit */
do {
p = values;
- start = u64_stats_fetch_begin(&port->tx_stats_syncp);
+ start = u64_stats_fetch_begin_irq(&port->tx_stats_syncp);
for (i = 0; i < TX_MAX_FRAGS; i++) {
*values++ = port->tx_frag_stats[i];
@@ -2066,7 +2066,7 @@ static void gmac_get_ethtool_stats(struct net_device *netdev,
*values++ = port->tx_frags_linearized;
*values++ = port->tx_hw_csummed;
- } while (u64_stats_fetch_retry(&port->tx_stats_syncp, start));
+ } while (u64_stats_fetch_retry_irq(&port->tx_stats_syncp, start));
}
static int gmac_get_ksettings(struct net_device *netdev,
diff --git a/drivers/net/ethernet/google/gve/gve_ethtool.c b/drivers/net/ethernet/google/gve/gve_ethtool.c
index 66f9b37..80a8c0c 100644
--- a/drivers/net/ethernet/google/gve/gve_ethtool.c
+++ b/drivers/net/ethernet/google/gve/gve_ethtool.c
@@ -172,14 +172,14 @@ static int gve_get_sset_count(struct net_device *netdev, int sset)
struct gve_rx_ring *rx = &priv->rx[ring];
start =
- u64_stats_fetch_begin(&priv->rx[ring].statss);
+ u64_stats_fetch_begin_irq(&priv->rx[ring].statss);
tmp_rx_pkts = rx->rpackets;
tmp_rx_bytes = rx->rbytes;
tmp_rx_skb_alloc_fail = rx->rx_skb_alloc_fail;
tmp_rx_buf_alloc_fail = rx->rx_buf_alloc_fail;
tmp_rx_desc_err_dropped_pkt =
rx->rx_desc_err_dropped_pkt;
- } while (u64_stats_fetch_retry(&priv->rx[ring].statss,
+ } while (u64_stats_fetch_retry_irq(&priv->rx[ring].statss,
start));
rx_pkts += tmp_rx_pkts;
rx_bytes += tmp_rx_bytes;
@@ -193,10 +193,10 @@ static int gve_get_sset_count(struct net_device *netdev, int sset)
if (priv->tx) {
do {
start =
- u64_stats_fetch_begin(&priv->tx[ring].statss);
+ u64_stats_fetch_begin_irq(&priv->tx[ring].statss);
tmp_tx_pkts = priv->tx[ring].pkt_done;
tmp_tx_bytes = priv->tx[ring].bytes_done;
- } while (u64_stats_fetch_retry(&priv->tx[ring].statss,
+ } while (u64_stats_fetch_retry_irq(&priv->tx[ring].statss,
start));
tx_pkts += tmp_tx_pkts;
tx_bytes += tmp_tx_bytes;
@@ -254,13 +254,13 @@ static int gve_get_sset_count(struct net_device *netdev, int sset)
data[i++] = rx->cnt;
do {
start =
- u64_stats_fetch_begin(&priv->rx[ring].statss);
+ u64_stats_fetch_begin_irq(&priv->rx[ring].statss);
tmp_rx_bytes = rx->rbytes;
tmp_rx_skb_alloc_fail = rx->rx_skb_alloc_fail;
tmp_rx_buf_alloc_fail = rx->rx_buf_alloc_fail;
tmp_rx_desc_err_dropped_pkt =
rx->rx_desc_err_dropped_pkt;
- } while (u64_stats_fetch_retry(&priv->rx[ring].statss,
+ } while (u64_stats_fetch_retry_irq(&priv->rx[ring].statss,
start));
data[i++] = tmp_rx_bytes;
/* rx dropped packets */
@@ -313,9 +313,9 @@ static int gve_get_sset_count(struct net_device *netdev, int sset)
data[i++] = tx->done;
do {
start =
- u64_stats_fetch_begin(&priv->tx[ring].statss);
+ u64_stats_fetch_begin_irq(&priv->tx[ring].statss);
tmp_tx_bytes = tx->bytes_done;
- } while (u64_stats_fetch_retry(&priv->tx[ring].statss,
+ } while (u64_stats_fetch_retry_irq(&priv->tx[ring].statss,
start));
data[i++] = tmp_tx_bytes;
data[i++] = tx->wake_queue;
diff --git a/drivers/net/ethernet/google/gve/gve_main.c b/drivers/net/ethernet/google/gve/gve_main.c
index 6cb75bb..f0c1e6c8 100644
--- a/drivers/net/ethernet/google/gve/gve_main.c
+++ b/drivers/net/ethernet/google/gve/gve_main.c
@@ -40,10 +40,10 @@ static void gve_get_stats(struct net_device *dev, struct rtnl_link_stats64 *s)
for (ring = 0; ring < priv->rx_cfg.num_queues; ring++) {
do {
start =
- u64_stats_fetch_begin(&priv->rx[ring].statss);
+ u64_stats_fetch_begin_irq(&priv->rx[ring].statss);
packets = priv->rx[ring].rpackets;
bytes = priv->rx[ring].rbytes;
- } while (u64_stats_fetch_retry(&priv->rx[ring].statss,
+ } while (u64_stats_fetch_retry_irq(&priv->rx[ring].statss,
start));
s->rx_packets += packets;
s->rx_bytes += bytes;
@@ -53,10 +53,10 @@ static void gve_get_stats(struct net_device *dev, struct rtnl_link_stats64 *s)
for (ring = 0; ring < priv->tx_cfg.num_queues; ring++) {
do {
start =
- u64_stats_fetch_begin(&priv->tx[ring].statss);
+ u64_stats_fetch_begin_irq(&priv->tx[ring].statss);
packets = priv->tx[ring].pkt_done;
bytes = priv->tx[ring].bytes_done;
- } while (u64_stats_fetch_retry(&priv->tx[ring].statss,
+ } while (u64_stats_fetch_retry_irq(&priv->tx[ring].statss,
start));
s->tx_packets += packets;
s->tx_bytes += bytes;
@@ -1041,9 +1041,9 @@ void gve_handle_report_stats(struct gve_priv *priv)
if (priv->tx) {
for (idx = 0; idx < priv->tx_cfg.num_queues; idx++) {
do {
- start = u64_stats_fetch_begin(&priv->tx[idx].statss);
+ start = u64_stats_fetch_begin_irq(&priv->tx[idx].statss);
tx_bytes = priv->tx[idx].bytes_done;
- } while (u64_stats_fetch_retry(&priv->tx[idx].statss, start));
+ } while (u64_stats_fetch_retry_irq(&priv->tx[idx].statss, start));
stats[stats_idx++] = (struct stats) {
.stat_name = cpu_to_be32(TX_WAKE_CNT),
.value = cpu_to_be64(priv->tx[idx].wake_queue),
diff --git a/drivers/net/ethernet/huawei/hinic/hinic_rx.c b/drivers/net/ethernet/huawei/hinic/hinic_rx.c
index 57d5d79..1b57b67 100644
--- a/drivers/net/ethernet/huawei/hinic/hinic_rx.c
+++ b/drivers/net/ethernet/huawei/hinic/hinic_rx.c
@@ -375,7 +375,7 @@ void hinic_rxq_get_stats(struct hinic_rxq *rxq,
u64_stats_update_begin(&stats->syncp);
do {
- start = u64_stats_fetch_begin(&rxq_stats->syncp);
+ start = u64_stats_fetch_begin_irq(&rxq_stats->syncp);
stats->bytes = rxq_stats->bytes;
stats->packets = rxq_stats->packets;
stats->errors = rxq_stats->csum_errors +
@@ -384,7 +384,7 @@ void hinic_rxq_get_stats(struct hinic_rxq *rxq,
stats->other_errors = rxq_stats->other_errors;
stats->dropped = rxq_stats->dropped;
stats->rx_buf_empty = rxq_stats->rx_buf_empty;
- } while (u64_stats_fetch_retry(&rxq_stats->syncp, start));
+ } while (u64_stats_fetch_retry_irq(&rxq_stats->syncp, start));
u64_stats_update_end(&stats->syncp);
}
diff --git a/drivers/net/ethernet/huawei/hinic/hinic_tx.c b/drivers/net/ethernet/huawei/hinic/hinic_tx.c
index 75fa344..ff37b6f 100644
--- a/drivers/net/ethernet/huawei/hinic/hinic_tx.c
+++ b/drivers/net/ethernet/huawei/hinic/hinic_tx.c
@@ -61,7 +61,7 @@ void hinic_txq_get_stats(struct hinic_txq *txq,
u64_stats_update_begin(&stats->syncp);
do {
- start = u64_stats_fetch_begin(&txq_stats->syncp);
+ start = u64_stats_fetch_begin_irq(&txq_stats->syncp);
stats->bytes = txq_stats->bytes;
stats->packets = txq_stats->packets;
stats->busy = txq_stats->busy;
@@ -69,7 +69,7 @@ void hinic_txq_get_stats(struct hinic_txq *txq,
stats->dropped = txq_stats->dropped;
stats->big_frags_pkts = txq_stats->big_frags_pkts;
stats->big_udp_pkts = txq_stats->big_udp_pkts;
- } while (u64_stats_fetch_retry(&txq_stats->syncp, start));
+ } while (u64_stats_fetch_retry_irq(&txq_stats->syncp, start));
u64_stats_update_end(&stats->syncp);
}
diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
index dfc1f32..5ab230aa 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
+++ b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
@@ -3373,21 +3373,21 @@ static void nfp_net_stat64(struct net_device *netdev,
unsigned int start;
do {
- start = u64_stats_fetch_begin(&r_vec->rx_sync);
+ start = u64_stats_fetch_begin_irq(&r_vec->rx_sync);
data[0] = r_vec->rx_pkts;
data[1] = r_vec->rx_bytes;
data[2] = r_vec->rx_drops;
- } while (u64_stats_fetch_retry(&r_vec->rx_sync, start));
+ } while (u64_stats_fetch_retry_irq(&r_vec->rx_sync, start));
stats->rx_packets += data[0];
stats->rx_bytes += data[1];
stats->rx_dropped += data[2];
do {
- start = u64_stats_fetch_begin(&r_vec->tx_sync);
+ start = u64_stats_fetch_begin_irq(&r_vec->tx_sync);
data[0] = r_vec->tx_pkts;
data[1] = r_vec->tx_bytes;
data[2] = r_vec->tx_errors;
- } while (u64_stats_fetch_retry(&r_vec->tx_sync, start));
+ } while (u64_stats_fetch_retry_irq(&r_vec->tx_sync, start));
stats->tx_packets += data[0];
stats->tx_bytes += data[1];
stats->tx_errors += data[2];
diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_ethtool.c b/drivers/net/ethernet/netronome/nfp/nfp_net_ethtool.c
index bfcd90f..d4136d3 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_net_ethtool.c
+++ b/drivers/net/ethernet/netronome/nfp/nfp_net_ethtool.c
@@ -498,7 +498,7 @@ static u64 *nfp_vnic_get_sw_stats(struct net_device *netdev, u64 *data)
unsigned int start;
do {
- start = u64_stats_fetch_begin(&nn->r_vecs[i].rx_sync);
+ start = u64_stats_fetch_begin_irq(&nn->r_vecs[i].rx_sync);
data[0] = nn->r_vecs[i].rx_pkts;
tmp[0] = nn->r_vecs[i].hw_csum_rx_ok;
tmp[1] = nn->r_vecs[i].hw_csum_rx_inner_ok;
@@ -506,10 +506,10 @@ static u64 *nfp_vnic_get_sw_stats(struct net_device *netdev, u64 *data)
tmp[3] = nn->r_vecs[i].hw_csum_rx_error;
tmp[4] = nn->r_vecs[i].rx_replace_buf_alloc_fail;
tmp[5] = nn->r_vecs[i].hw_tls_rx;
- } while (u64_stats_fetch_retry(&nn->r_vecs[i].rx_sync, start));
+ } while (u64_stats_fetch_retry_irq(&nn->r_vecs[i].rx_sync, start));
do {
- start = u64_stats_fetch_begin(&nn->r_vecs[i].tx_sync);
+ start = u64_stats_fetch_begin_irq(&nn->r_vecs[i].tx_sync);
data[1] = nn->r_vecs[i].tx_pkts;
data[2] = nn->r_vecs[i].tx_busy;
tmp[6] = nn->r_vecs[i].hw_csum_tx;
@@ -519,7 +519,7 @@ static u64 *nfp_vnic_get_sw_stats(struct net_device *netdev, u64 *data)
tmp[10] = nn->r_vecs[i].hw_tls_tx;
tmp[11] = nn->r_vecs[i].tls_tx_fallback;
tmp[12] = nn->r_vecs[i].tls_tx_no_fallback;
- } while (u64_stats_fetch_retry(&nn->r_vecs[i].tx_sync, start));
+ } while (u64_stats_fetch_retry_irq(&nn->r_vecs[i].tx_sync, start));
data += NN_RVEC_PER_Q_STATS;
diff --git a/drivers/net/netdevsim/netdev.c b/drivers/net/netdevsim/netdev.c
index ad6dbf01..4fb0638 100644
--- a/drivers/net/netdevsim/netdev.c
+++ b/drivers/net/netdevsim/netdev.c
@@ -67,10 +67,10 @@ static int nsim_change_mtu(struct net_device *dev, int new_mtu)
unsigned int start;
do {
- start = u64_stats_fetch_begin(&ns->syncp);
+ start = u64_stats_fetch_begin_irq(&ns->syncp);
stats->tx_bytes = ns->tx_bytes;
stats->tx_packets = ns->tx_packets;
- } while (u64_stats_fetch_retry(&ns->syncp, start));
+ } while (u64_stats_fetch_retry_irq(&ns->syncp, start));
}
static int
diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c
index 461c037..cee39ae 100644
--- a/net/mac80211/sta_info.c
+++ b/net/mac80211/sta_info.c
@@ -2175,9 +2175,9 @@ static inline u64 sta_get_tidstats_msdu(struct ieee80211_sta_rx_stats *rxstats,
u64 value;
do {
- start = u64_stats_fetch_begin(&rxstats->syncp);
+ start = u64_stats_fetch_begin_irq(&rxstats->syncp);
value = rxstats->msdu[tid];
- } while (u64_stats_fetch_retry(&rxstats->syncp, start));
+ } while (u64_stats_fetch_retry_irq(&rxstats->syncp, start));
return value;
}
@@ -2241,9 +2241,9 @@ static inline u64 sta_get_stats_bytes(struct ieee80211_sta_rx_stats *rxstats)
u64 value;
do {
- start = u64_stats_fetch_begin(&rxstats->syncp);
+ start = u64_stats_fetch_begin_irq(&rxstats->syncp);
value = rxstats->bytes;
- } while (u64_stats_fetch_retry(&rxstats->syncp, start));
+ } while (u64_stats_fetch_retry_irq(&rxstats->syncp, start));
return value;
}
diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c
index 9c047c1..7239814 100644
--- a/net/mpls/af_mpls.c
+++ b/net/mpls/af_mpls.c
@@ -1078,9 +1078,9 @@ static void mpls_get_stats(struct mpls_dev *mdev,
p = per_cpu_ptr(mdev->stats, i);
do {
- start = u64_stats_fetch_begin(&p->syncp);
+ start = u64_stats_fetch_begin_irq(&p->syncp);
local = p->stats;
- } while (u64_stats_fetch_retry(&p->syncp, start));
+ } while (u64_stats_fetch_retry_irq(&p->syncp, start));
stats->rx_packets += local.rx_packets;
stats->rx_bytes += local.rx_bytes;
--
1.8.3.1
[PATCH openEuler-1.0-LTS 1/3] mm: hwpoison: refactor refcount check handling
by Yongqiang Liu 04 Feb '23
04 Feb '23
From: Yang Shi <shy828301(a)gmail.com>
stable inclusion
from stable-v5.15.86
commit a62b1bc603a1ded739e7cf543da29a3eb93cc534
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I6AR36
CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id…
--------------------------------
commit dd0f230a0a80ff396c7ce587f16429f2a8131344 upstream.
Memory failure reports failure if the page still has an extra pinned
refcount, other than the one taken by hwpoison, after the handler is
done. The check is not actually necessary for all handlers, so move it
into the specific handlers. This will make the following patch, which
keeps shmem pages in the page cache, easier.
An extra pin may be expected in some cases, for example when the page
is dirty and in swapcache.
Link: https://lkml.kernel.org/r/20211020210755.23964-5-shy828301@gmail.com
Signed-off-by: Yang Shi <shy828301(a)gmail.com>
Signed-off-by: Naoya Horiguchi <naoya.horiguchi(a)nec.com>
Suggested-by: Naoya Horiguchi <naoya.horiguchi(a)nec.com>
Cc: Hugh Dickins <hughd(a)google.com>
Cc: Kirill A. Shutemov <kirill.shutemov(a)linux.intel.com>
Cc: Matthew Wilcox <willy(a)infradead.org>
Cc: Oscar Salvador <osalvador(a)suse.de>
Cc: Peter Xu <peterx(a)redhat.com>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds(a)linux-foundation.org>
Cc: Naoya Horiguchi <naoya.horiguchi(a)linux.dev>
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
Signed-off-by: Ze Zuo <zuoze1(a)huawei.com>
Reviewed-by: Kefeng Wang <wangkefeng.wang(a)huawei.com>
Signed-off-by: Yongqiang Liu <liuyongqiang13(a)huawei.com>
---
mm/memory-failure.c | 93 +++++++++++++++++++++++++++++++--------------
1 file changed, 64 insertions(+), 29 deletions(-)
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 9a816fdf812d..b653637d5a00 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -655,12 +655,44 @@ static int truncate_error_page(struct page *p, unsigned long pfn,
return ret;
}
+struct page_state {
+ unsigned long mask;
+ unsigned long res;
+ enum mf_action_page_type type;
+
+ /* Callback ->action() has to unlock the relevant page inside it. */
+ int (*action)(struct page_state *ps, struct page *p);
+};
+
+/*
+ * Return true if page is still referenced by others, otherwise return
+ * false.
+ *
+ * The extra_pins is true when one extra refcount is expected.
+ */
+static bool has_extra_refcount(struct page_state *ps, struct page *p,
+ bool extra_pins)
+{
+ int count = page_count(p) - 1;
+
+ if (extra_pins)
+ count -= 1;
+
+ if (count > 0) {
+ pr_err("Memory failure: %#lx: %s still referenced by %d users\n",
+ page_to_pfn(p), action_page_types[ps->type], count);
+ return true;
+ }
+
+ return false;
+}
+
/*
* Error hit kernel page.
* Do nothing, try to be lucky and not touch this instead. For a few cases we
* could be more sophisticated.
*/
-static int me_kernel(struct page *p, unsigned long pfn)
+static int me_kernel(struct page_state *ps, struct page *p)
{
unlock_page(p);
return MF_IGNORED;
@@ -669,9 +701,9 @@ static int me_kernel(struct page *p, unsigned long pfn)
/*
* Page in unknown state. Do nothing.
*/
-static int me_unknown(struct page *p, unsigned long pfn)
+static int me_unknown(struct page_state *ps, struct page *p)
{
- pr_err("Memory failure: %#lx: Unknown page state\n", pfn);
+ pr_err("Memory failure: %#lx: Unknown page state\n", page_to_pfn(p));
unlock_page(p);
return MF_FAILED;
}
@@ -679,7 +711,7 @@ static int me_unknown(struct page *p, unsigned long pfn)
/*
* Clean (or cleaned) page cache page.
*/
-static int me_pagecache_clean(struct page *p, unsigned long pfn)
+static int me_pagecache_clean(struct page_state *ps, struct page *p)
{
int ret;
struct address_space *mapping;
@@ -716,9 +748,13 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn)
*
* Open: to take i_mutex or not for this? Right now we don't.
*/
- ret = truncate_error_page(p, pfn, mapping);
+ ret = truncate_error_page(p, page_to_pfn(p), mapping);
out:
unlock_page(p);
+
+ if (has_extra_refcount(ps, p, false))
+ ret = MF_FAILED;
+
return ret;
}
@@ -727,7 +763,7 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn)
* Issues: when the error hit a hole page the error is not properly
* propagated.
*/
-static int me_pagecache_dirty(struct page *p, unsigned long pfn)
+static int me_pagecache_dirty(struct page_state *ps, struct page *p)
{
struct address_space *mapping = page_mapping(p);
@@ -771,7 +807,7 @@ static int me_pagecache_dirty(struct page *p, unsigned long pfn)
mapping_set_error(mapping, -EIO);
}
- return me_pagecache_clean(p, pfn);
+ return me_pagecache_clean(ps, p);
}
/*
@@ -793,9 +829,10 @@ static int me_pagecache_dirty(struct page *p, unsigned long pfn)
* Clean swap cache pages can be directly isolated. A later page fault will
* bring in the known good data from disk.
*/
-static int me_swapcache_dirty(struct page *p, unsigned long pfn)
+static int me_swapcache_dirty(struct page_state *ps, struct page *p)
{
int ret;
+ bool extra_pins = false;
ClearPageDirty(p);
/* Trigger EIO in shmem: */
@@ -803,10 +840,17 @@ static int me_swapcache_dirty(struct page *p, unsigned long pfn)
ret = delete_from_lru_cache(p) ? MF_FAILED : MF_DELAYED;
unlock_page(p);
+
+ if (ret == MF_DELAYED)
+ extra_pins = true;
+
+ if (has_extra_refcount(ps, p, extra_pins))
+ ret = MF_FAILED;
+
return ret;
}
-static int me_swapcache_clean(struct page *p, unsigned long pfn)
+static int me_swapcache_clean(struct page_state *ps, struct page *p)
{
int ret;
@@ -814,6 +858,10 @@ static int me_swapcache_clean(struct page *p, unsigned long pfn)
ret = delete_from_lru_cache(p) ? MF_FAILED : MF_RECOVERED;
unlock_page(p);
+
+ if (has_extra_refcount(ps, p, false))
+ ret = MF_FAILED;
+
return ret;
}
@@ -823,7 +871,7 @@ static int me_swapcache_clean(struct page *p, unsigned long pfn)
* - Error on hugepage is contained in hugepage unit (not in raw page unit.)
* To narrow down kill region to one page, we need to break up pmd.
*/
-static int me_huge_page(struct page *p, unsigned long pfn)
+static int me_huge_page(struct page_state *ps, struct page *p)
{
int res;
struct page *hpage = compound_head(p);
@@ -834,7 +882,7 @@ static int me_huge_page(struct page *p, unsigned long pfn)
mapping = page_mapping(hpage);
if (mapping) {
- res = truncate_error_page(hpage, pfn, mapping);
+ res = truncate_error_page(hpage, page_to_pfn(p), mapping);
unlock_page(hpage);
} else {
res = MF_FAILED;
@@ -852,6 +900,9 @@ static int me_huge_page(struct page *p, unsigned long pfn)
}
}
+ if (has_extra_refcount(ps, p, false))
+ res = MF_FAILED;
+
return res;
}
@@ -878,14 +929,7 @@ static int me_huge_page(struct page *p, unsigned long pfn)
#define slab (1UL << PG_slab)
#define reserved (1UL << PG_reserved)
-static struct page_state {
- unsigned long mask;
- unsigned long res;
- enum mf_action_page_type type;
-
- /* Callback ->action() has to unlock the relevant page inside it. */
- int (*action)(struct page *p, unsigned long pfn);
-} error_states[] = {
+static struct page_state error_states[] = {
{ reserved, reserved, MF_MSG_KERNEL, me_kernel },
/*
* free pages are specially detected outside this table:
@@ -946,19 +990,10 @@ static int page_action(struct page_state *ps, struct page *p,
unsigned long pfn)
{
int result;
- int count;
/* page p should be unlocked after returning from ps->action(). */
- result = ps->action(p, pfn);
+ result = ps->action(ps, p);
- count = page_count(p) - 1;
- if (ps->action == me_swapcache_dirty && result == MF_DELAYED)
- count--;
- if (count > 0) {
- pr_err("Memory failure: %#lx: %s still referenced by %d users\n",
- pfn, action_page_types[ps->type], count);
- result = MF_FAILED;
- }
action_result(pfn, ps->type, result);
/* Could do more checks here if page looks ok */
--
2.25.1
[PATCH openEuler-1.0-LTS 1/6] dhugetlb: backport dynamic hugetlb feature
by Yongqiang Liu 04 Feb '23
04 Feb '23
From: Liu Shixin <liushixin2(a)hauwei.com>
hulk inclusion
category: feature
bugzilla: 46904, https://gitee.com/openeuler/kernel/issues/I6BDME
CVE: NA
--------------------------------
This feature is already supported on x86_64; this is the original
description:
Dynamic hugetlb, which is based on HugeTLB, allows huge pages to be
split dynamically in a specified cgroup. We add a hugetlb_pool in a
mem_cgroup to manage dynamic hugetlb for the corresponding cgroup.
After dynamic hugepages are allocated for a cgroup, these hugepages
can be used as 1G/2M/4K pages by split/merge operations.
It is now supported on arm64. The feature is limited to depend on
ARM64_4K_PAGES and does not support cont-bit hugepages. We merge the
previous patches into one patch, which is patch[1]. While merging the
code we found that some of it could be isolated behind
CONFIG_DYNAMIC_HUGETLB, so patch[2] re-isolates it. Patch[3] restricts
the feature to the limits mentioned above. Patch[4] adds skipping of
hugepage dissolution, which may conflict with memory hotplug and
memory failure. Patch[5] sets DYNAMIC_HUGETLB to y in hulk_defconfig
to enable it by default.
This patch includes all previous patches; the patch list is recorded
in bugzilla.
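For illustration, the split accounting described above boils down to
the following stand-alone sketch. It assumes 4K base pages, so one 1G
page splits into 512 2M pages and one 2M page into 512 4K pages; the
demo_* names are hypothetical, the real logic lives in
split_free_huge_page()/split_free_small_page() in the diff below and
is protected by hpool->lock:

#include <stdio.h>

struct demo_hpool {
	unsigned long free_1G;	/* unreserved 1G hugepages */
	unsigned long free_2M;	/* unreserved 2M hugepages */
	unsigned long free_4K;	/* free 4K pages           */
};

/* Split one 1G hugepage into 512 2M hugepages (PUD -> PMD). */
static int demo_split_1G(struct demo_hpool *p)
{
	if (!p->free_1G)
		return -1;
	p->free_1G -= 1;
	p->free_2M += 512;
	return 0;
}

/* Split one 2M hugepage into 512 4K pages (PMD -> PTE). */
static int demo_split_2M(struct demo_hpool *p)
{
	if (!p->free_2M && demo_split_1G(p))
		return -1;
	p->free_2M -= 1;
	p->free_4K += 512;
	return 0;
}

int main(void)
{
	struct demo_hpool pool = { .free_1G = 2 };

	demo_split_2M(&pool);	/* falls back to splitting a 1G page first */
	printf("1G=%lu 2M=%lu 4K=%lu\n",
	       pool.free_1G, pool.free_2M, pool.free_4K);
	return 0;
}

Merging is the inverse walk: once every 4K page of a split 2M block is
free again, merge_free_split_huge() in the diff returns the block to
the 2M freelist.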
Signed-off-by: Liu Shixin <liushixin2(a)hauwei.com>
Reviewed-by: Kefeng Wang <wangkefeng.wang(a)huawei.com>
Signed-off-by: Yongqiang Liu <liuyongqiang13(a)huawei.com>
---
fs/Kconfig | 9 +
fs/hugetlbfs/inode.c | 4 +
include/linux/gfp.h | 4 +-
include/linux/hugetlb.h | 97 +++
include/linux/memcontrol.h | 15 +
include/linux/page-flags.h | 3 +
include/trace/events/dhugetlb.h | 123 ++++
include/trace/events/mmflags.h | 1 +
kernel/cgroup/cgroup.c | 6 +
mm/huge_memory.c | 16 +-
mm/hugetlb.c | 1188 ++++++++++++++++++++++++++++++-
mm/internal.h | 1 +
mm/memcontrol.c | 391 ++++++++++
mm/page_alloc.c | 33 +-
14 files changed, 1862 insertions(+), 29 deletions(-)
create mode 100644 include/trace/events/dhugetlb.h
diff --git a/fs/Kconfig b/fs/Kconfig
index 5921bfbebee4..e8800d8a73b3 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -211,6 +211,15 @@ config TMPFS_INODE64
If unsure, say N.
+config DYNAMIC_HUGETLB
+ bool "Dynamic HugeTLB"
+ depends on HUGETLB_PAGE
+ depends on MEMCG
+ depends on CGROUP_HUGETLB
+ help
+ Dynamic hugepage are used in memcg and can be splited into small pages
+ automatically. The tasks in the memcg prefer to alloc dynamic hugepage.
+
config HUGETLBFS
bool "HugeTLB file system support"
depends on X86 || IA64 || SPARC64 || (S390 && 64BIT) || \
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 005e05c442c5..30a29936372c 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -1164,6 +1164,8 @@ static struct inode *hugetlbfs_alloc_inode(struct super_block *sb)
* private inode. This simplifies hugetlbfs_destroy_inode.
*/
mpol_shared_policy_init(&p->policy, NULL);
+ /* Initialize hpool here in case of a quick call to destroy */
+ p->hpool = get_dhugetlb_pool_from_task(current);
return &p->vfs_inode;
}
@@ -1178,6 +1180,8 @@ static void hugetlbfs_destroy_inode(struct inode *inode)
{
hugetlbfs_inc_free_inodes(HUGETLBFS_SB(inode->i_sb));
mpol_free_shared_policy(&HUGETLBFS_I(inode)->policy);
+ dhugetlb_pool_put(HUGETLBFS_I(inode)->hpool);
+ HUGETLBFS_I(inode)->hpool = NULL;
call_rcu(&inode->i_rcu, hugetlbfs_i_callback);
}
diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index 152cb9bdf436..74b0375d7d2b 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -501,7 +501,9 @@ static inline void arch_alloc_page(struct page *page, int order) { }
struct page *
__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, int preferred_nid,
nodemask_t *nodemask);
-
+void prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags,
+ unsigned int alloc_flags);
+bool free_pages_prepare(struct page *page, unsigned int order, bool check_free);
static inline struct page *
__alloc_pages(gfp_t gfp_mask, unsigned int order, int preferred_nid)
{
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 2d2b06b36bd0..3a82ea9283ec 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -289,6 +289,7 @@ struct hugetlbfs_inode_info {
struct shared_policy policy;
struct inode vfs_inode;
unsigned int seals;
+ struct dhugetlb_pool *hpool;
};
static inline struct hugetlbfs_inode_info *HUGETLBFS_I(struct inode *inode)
@@ -655,6 +656,102 @@ static inline void set_huge_swap_pte_at(struct mm_struct *mm, unsigned long addr
#endif /* CONFIG_HUGETLB_PAGE */
+#ifdef CONFIG_DYNAMIC_HUGETLB
+/* The number of small_page_pool for a dhugetlb_pool */
+#define NR_SMPOOL num_possible_cpus()
+/* The max page number in a small_page_pool */
+#define MAX_SMPOOL_PAGE 1024
+/* number to move between list */
+#define BATCH_SMPOOL_PAGE (MAX_SMPOOL_PAGE >> 2)
+/* We don't need to try 5 times, or we can't migrate the pages. */
+#define HPOOL_RECLAIM_RETRIES 5
+
+extern struct static_key_false dhugetlb_enabled_key;
+#define dhugetlb_enabled (static_branch_unlikely(&dhugetlb_enabled_key))
+
+#define DEFAULT_PAGESIZE 4096
+extern rwlock_t dhugetlb_pagelist_rwlock;
+struct dhugetlb_pagelist {
+ unsigned long count;
+ struct dhugetlb_pool *hpool[0];
+};
+extern struct dhugetlb_pagelist *dhugetlb_pagelist_t;
+
+struct split_pages {
+ struct list_head list;
+ unsigned long start_pfn;
+ unsigned long free_pages;
+};
+
+struct small_page_pool {
+ spinlock_t lock;
+ unsigned long free_pages;
+ long used_pages;
+ struct list_head head_page;
+};
+
+struct dhugetlb_pool {
+ int nid;
+ spinlock_t lock;
+ spinlock_t reserved_lock;
+ atomic_t refcnt;
+
+ struct mem_cgroup *attach_memcg;
+
+ struct list_head dhugetlb_1G_freelists;
+ struct list_head dhugetlb_2M_freelists;
+ struct list_head dhugetlb_4K_freelists;
+
+ struct list_head split_1G_freelists;
+ struct list_head split_2M_freelists;
+
+ unsigned long total_nr_pages;
+
+ unsigned long total_reserved_1G;
+ unsigned long free_reserved_1G;
+ unsigned long mmap_reserved_1G;
+ unsigned long used_1G;
+ unsigned long free_unreserved_1G;
+ unsigned long nr_split_1G;
+
+ unsigned long total_reserved_2M;
+ unsigned long free_reserved_2M;
+ unsigned long mmap_reserved_2M;
+ unsigned long used_2M;
+ unsigned long free_unreserved_2M;
+ unsigned long nr_split_2M;
+
+ unsigned long free_pages;
+ struct small_page_pool smpool[0];
+};
+
+bool dhugetlb_pool_get(struct dhugetlb_pool *hpool);
+void dhugetlb_pool_put(struct dhugetlb_pool *hpool);
+struct dhugetlb_pool *hpool_alloc(unsigned long nid);
+int alloc_hugepage_from_hugetlb(struct dhugetlb_pool *hpool,
+ unsigned long nid, unsigned long size);
+bool free_dhugetlb_pool(struct dhugetlb_pool *hpool);
+int update_dhugetlb_pagelist(unsigned long idx, struct dhugetlb_pool *hpool);
+struct dhugetlb_pool *get_dhugetlb_pool_from_dhugetlb_pagelist(
+ struct page *page);
+struct dhugetlb_pool *get_dhugetlb_pool_from_task(struct task_struct *tsk);
+bool move_pages_from_hpool_to_smpool(struct dhugetlb_pool *hpool,
+ struct small_page_pool *smpool);
+void move_pages_from_smpool_to_hpool(struct dhugetlb_pool *hpool,
+ struct small_page_pool *smpool);
+void dhugetlb_reserve_hugepages(struct dhugetlb_pool *hpool,
+ unsigned long count, bool gigantic);
+#else
+#define dhugetlb_enabled 0
+struct dhugetlb_pool {};
+static inline struct dhugetlb_pool *get_dhugetlb_pool_from_task(
+ struct task_struct *tsk)
+{
+ return NULL;
+}
+static inline void dhugetlb_pool_put(struct dhugetlb_pool *hpool) { return; }
+#endif /* CONFIG_DYNAMIC_HUGETLB */
+
static inline spinlock_t *huge_pte_lock(struct hstate *h,
struct mm_struct *mm, pte_t *pte)
{
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 4517d132d1e2..22f40d5e0e8b 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -326,6 +326,7 @@ struct mem_cgroup {
};
struct mem_cgroup_extension {
+ struct dhugetlb_pool *hpool;
#ifdef CONFIG_MEMCG_QOS
/* Currently support 0 and -1.
* in the future it can expand to other value.
@@ -1406,4 +1407,18 @@ static inline void memcg_put_cache_ids(void)
#endif /* CONFIG_MEMCG_KMEM */
+#ifdef CONFIG_DYNAMIC_HUGETLB
+struct dhugetlb_pool *get_dhugetlb_pool_from_memcg(struct mem_cgroup *memcg);
+struct page *alloc_page_from_dhugetlb_pool(gfp_t gfp_mask);
+void free_page_to_dhugetlb_pool(struct page *page);
+int dhugetlb_pool_force_empty(struct mem_cgroup *memcg);
+bool dhugetlb_pool_is_free(struct cgroup_subsys_state *css);
+#else
+static inline struct page *alloc_page_from_dhugetlb_pool(gfp_t gfp_mask)
+{
+ return NULL;
+}
+static inline void free_page_to_dhugetlb_pool(struct page *page) {}
+#endif
+
#endif /* _LINUX_MEMCONTROL_H */
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index 0c5d1c4c71e6..fd6cd68e00a2 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -102,6 +102,7 @@ enum pageflags {
PG_idle,
#endif
PG_percpu_ref,
+ PG_pool,
__NR_PAGEFLAGS,
/* Filesystems */
@@ -284,6 +285,7 @@ PAGEFLAG(Active, active, PF_HEAD) __CLEARPAGEFLAG(Active, active, PF_HEAD)
__PAGEFLAG(Slab, slab, PF_NO_TAIL)
__PAGEFLAG(SlobFree, slob_free, PF_NO_TAIL)
PAGEFLAG(Checked, checked, PF_NO_COMPOUND) /* Used by some filesystems */
+PAGEFLAG(Pool, pool, PF_NO_TAIL)
/* Xen */
PAGEFLAG(Pinned, pinned, PF_NO_COMPOUND)
@@ -770,6 +772,7 @@ static inline void ClearPageSlabPfmemalloc(struct page *page)
1UL << PG_private | 1UL << PG_private_2 | \
1UL << PG_writeback | 1UL << PG_reserved | \
1UL << PG_slab | 1UL << PG_active | \
+ 1UL << PG_pool | \
1UL << PG_unevictable | __PG_MLOCKED)
/*
diff --git a/include/trace/events/dhugetlb.h b/include/trace/events/dhugetlb.h
new file mode 100644
index 000000000000..20b3a54589d1
--- /dev/null
+++ b/include/trace/events/dhugetlb.h
@@ -0,0 +1,123 @@
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM dhugetlb
+
+#if !defined(_TRACE_DHUGETLB_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_DHUGETLB_H
+
+#include <linux/tracepoint.h>
+#include <trace/events/mmflags.h>
+
+#define DHUGETLB_SPLIT_1G 0x01u
+#define DHUGETLB_SPLIT_2M 0x02u
+#define DHUGETLB_MERGE_4K 0x04u
+#define DHUGETLB_MIGRATE_4K 0x08u
+#define DHUGETLB_RESV_1G 0x10u
+#define DHUGETLB_UNRESV_1G 0x20u
+#define DHUGETLB_RESV_2M 0x40u
+#define DHUGETLB_UNRESV_2M 0x80u
+#define DHUGETLB_ALLOC_1G 0x100u
+#define DHUGETLB_FREE_1G 0x200u
+#define DHUGETLB_ALLOC_2M 0x400u
+#define DHUGETLB_FREE_2M 0x800u
+
+#define __def_action_names \
+ {(unsigned long)DHUGETLB_SPLIT_1G, "split_1G_to_2M"}, \
+ {(unsigned long)DHUGETLB_SPLIT_2M, "split_2M_to_4K"}, \
+ {(unsigned long)DHUGETLB_MERGE_4K, "merge_4K_to_2M"}, \
+ {(unsigned long)DHUGETLB_MIGRATE_4K, "migrate_4K_to_2M"}, \
+ {(unsigned long)DHUGETLB_RESV_1G, "resv_1G_page"}, \
+ {(unsigned long)DHUGETLB_UNRESV_1G, "unresv_1G_page"}, \
+ {(unsigned long)DHUGETLB_RESV_2M, "resv_2M_page"}, \
+ {(unsigned long)DHUGETLB_UNRESV_2M, "unresv_2M_page"}, \
+ {(unsigned long)DHUGETLB_ALLOC_1G, "alloc_1G_page"}, \
+ {(unsigned long)DHUGETLB_FREE_1G, "free_1G_page"}, \
+ {(unsigned long)DHUGETLB_ALLOC_2M, "alloc_2M_page"}, \
+ {(unsigned long)DHUGETLB_FREE_2M, "free_2M_page"}
+
+#define show_action(action) \
+ (action) ? __print_flags(action, "", \
+ __def_action_names \
+ ) : "none"
+
+TRACE_EVENT(dhugetlb_split_merge,
+
+ TP_PROTO(const void *hpool, struct page *page, unsigned long action),
+
+ TP_ARGS(hpool, page, action),
+
+ TP_STRUCT__entry(
+ __field( const void *, hpool )
+ __field( unsigned long, pfn )
+ __field( unsigned long, action )
+ ),
+
+ TP_fast_assign(
+ __entry->hpool = hpool;
+ __entry->pfn = page ? page_to_pfn(page) : -1UL;
+ __entry->action = action;
+ ),
+
+ TP_printk("hpool=%p page=%p pfn=%lu action=%s",
+ __entry->hpool,
+ __entry->pfn != -1UL ? pfn_to_page(__entry->pfn) : NULL,
+ __entry->pfn != -1UL ? __entry->pfn : 0,
+ show_action(__entry->action))
+);
+
+TRACE_EVENT(dhugetlb_acct_memory,
+
+ TP_PROTO(const void *hpool, unsigned long count, unsigned long action),
+
+ TP_ARGS(hpool, count, action),
+
+ TP_STRUCT__entry(
+ __field( const void *, hpool )
+ __field( unsigned long, count )
+ __field( unsigned long, action )
+ ),
+
+ TP_fast_assign(
+ __entry->hpool = hpool;
+ __entry->count = count;
+ __entry->action = action;
+ ),
+
+ TP_printk("hpool=%p action=%s, mmap_count=%lu",
+ __entry->hpool,
+ show_action(__entry->action),
+ __entry->count)
+);
+
+TRACE_EVENT(dhugetlb_alloc_free,
+
+ TP_PROTO(const void *hpool, struct page *page, unsigned long count,
+ unsigned long action),
+
+ TP_ARGS(hpool, page, count, action),
+
+ TP_STRUCT__entry(
+ __field( const void *, hpool )
+ __field( unsigned long, pfn )
+ __field( unsigned long, count )
+ __field( unsigned long, action )
+ ),
+
+ TP_fast_assign(
+ __entry->hpool = hpool;
+ __entry->pfn = page ? page_to_pfn(page) : -1UL;
+ __entry->count = count;
+ __entry->action = action;
+ ),
+
+ TP_printk("hpool=%p page=%p pfn=%lu action=%s free_count=%lu",
+ __entry->hpool,
+ __entry->pfn != -1UL ? pfn_to_page(__entry->pfn) : NULL,
+ __entry->pfn != -1UL ? __entry->pfn : 0,
+ show_action(__entry->action),
+ __entry->count)
+);
+
+#endif /* _TRACE_DHUGETLB_H */
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
diff --git a/include/trace/events/mmflags.h b/include/trace/events/mmflags.h
index b817bf1885a0..4d06b47129f3 100644
--- a/include/trace/events/mmflags.h
+++ b/include/trace/events/mmflags.h
@@ -81,6 +81,7 @@
#define __def_pageflag_names \
{1UL << PG_locked, "locked" }, \
+ {1UL << PG_pool, "pool" }, \
{1UL << PG_waiters, "waiters" }, \
{1UL << PG_error, "error" }, \
{1UL << PG_referenced, "referenced" }, \
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 7456882e1a0f..b01490b71f32 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -65,6 +65,7 @@
/* let's not notify more than 100 times per second */
#define CGROUP_FILE_NOTIFY_MIN_INTV DIV_ROUND_UP(HZ, 100)
+bool dhugetlb_pool_is_free(struct cgroup_subsys_state *css);
/*
* cgroup_mutex is the master lock. Any modification to cgroup or its
* hierarchy must be performed while holding it.
@@ -5280,6 +5281,11 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
if (css_has_online_children(&cgrp->self))
return -EBUSY;
+#ifdef CONFIG_MEMCG
+ /* If we use dynamic hugetlb, make sure dhugtlb_pool is free */
+ if (!dhugetlb_pool_is_free(cgrp->subsys[memory_cgrp_id]))
+ return -EBUSY;
+#endif
/*
* Mark @cgrp and the associated csets dead. The former prevents
* further task migration and child creation by disabling
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index f8319265c1cf..484ffdbf5f45 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -396,6 +396,20 @@ static int __init hugepage_init(void)
return -EINVAL;
}
+ /*
+ * When we alloc some pages(order = 0), system may help us to alloc
+ * a page(order > 0) due to transparent hugepage. This result
+ * dynamic hugetlb to be skipped. Actually, using dynamic hugetlb
+ * means we have already optimized the program, so we should not
+ * use transparent hugepage in addition.
+ * (May result negative optimization)
+ */
+ if (dhugetlb_enabled) {
+ transparent_hugepage_flags = 0;
+ pr_info("transparent hugepage is disabled due to confilct with dynamic hugetlb\n");
+ return -EINVAL;
+ }
+
/*
* hugepages can't be allocated by the buddy allocator
*/
@@ -2946,9 +2960,9 @@ static unsigned long deferred_split_count(struct shrinker *shrink,
{
struct pglist_data *pgdata = NODE_DATA(sc->nid);
unsigned long *split_queue_len = &pgdata->split_queue_len;
+#ifdef CONFIG_MEMCG
struct mem_cgroup_extension *memcg_ext;
-#ifdef CONFIG_MEMCG
if (sc->memcg) {
memcg_ext = container_of(sc->memcg, struct mem_cgroup_extension, memcg);
split_queue_len = &memcg_ext->split_queue_len;
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 495d8b5b38fc..4c8c91acd6d5 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -27,6 +27,12 @@
#include <linux/jhash.h>
#include <linux/mman.h>
#include <linux/share_pool.h>
+#include <linux/kthread.h>
+#include <linux/cpuhotplug.h>
+#include <linux/freezer.h>
+#include <linux/delay.h>
+#include <linux/migrate.h>
+#include <linux/mm_inline.h>
#include <asm/page.h>
#include <asm/pgtable.h>
@@ -39,8 +45,14 @@
#include <linux/userfaultfd_k.h>
#include <linux/page_owner.h>
#include <linux/share_pool.h>
+#include <linux/memblock.h>
#include "internal.h"
+#if (defined CONFIG_DYNAMIC_HUGETLB) && (!defined __GENKSYMS__)
+#define CREATE_TRACE_POINTS
+#include <trace/events/dhugetlb.h>
+#endif
+
int hugetlb_max_hstate __read_mostly;
unsigned int default_hstate_idx;
struct hstate hstates[HUGE_MAX_HSTATE];
@@ -89,7 +101,8 @@ static inline void ClearPageHugeFreed(struct page *head)
}
/* Forward declaration */
-static int hugetlb_acct_memory(struct hstate *h, long delta);
+static int hugetlb_acct_memory(struct hstate *h, long delta,
+ struct dhugetlb_pool *hpool);
static inline void unlock_or_release_subpool(struct hugepage_subpool *spool)
{
@@ -103,7 +116,7 @@ static inline void unlock_or_release_subpool(struct hugepage_subpool *spool)
if (free) {
if (spool->min_hpages != -1)
hugetlb_acct_memory(spool->hstate,
- -spool->min_hpages);
+ -spool->min_hpages, NULL);
kfree(spool);
}
}
@@ -123,7 +136,7 @@ struct hugepage_subpool *hugepage_new_subpool(struct hstate *h, long max_hpages,
spool->hstate = h;
spool->min_hpages = min_hpages;
- if (min_hpages != -1 && hugetlb_acct_memory(h, min_hpages)) {
+ if (min_hpages != -1 && hugetlb_acct_memory(h, min_hpages, NULL)) {
kfree(spool);
return NULL;
}
@@ -149,13 +162,17 @@ void hugepage_put_subpool(struct hugepage_subpool *spool)
* a subpool minimum size must be manitained.
*/
static long hugepage_subpool_get_pages(struct hugepage_subpool *spool,
- long delta)
+ long delta, struct dhugetlb_pool *hpool)
{
long ret = delta;
if (!spool)
return ret;
+ /* Skip subpool when hugetlb file belongs to a hugetlb_pool */
+ if (dhugetlb_enabled && hpool)
+ return ret;
+
spin_lock(&spool->lock);
if (spool->max_hpages != -1) { /* maximum size accounting */
@@ -194,13 +211,17 @@ static long hugepage_subpool_get_pages(struct hugepage_subpool *spool,
* in the case where a subpool minimum size must be maintained.
*/
static long hugepage_subpool_put_pages(struct hugepage_subpool *spool,
- long delta)
+ long delta, struct dhugetlb_pool *hpool)
{
long ret = delta;
if (!spool)
return delta;
+ /* Skip subpool when hugetlb file belongs to a hugetlb_pool */
+ if (dhugetlb_enabled && hpool)
+ return ret;
+
spin_lock(&spool->lock);
if (spool->max_hpages != -1) /* maximum size accounting */
@@ -594,12 +615,13 @@ void hugetlb_fix_reserve_counts(struct inode *inode)
struct hugepage_subpool *spool = subpool_inode(inode);
long rsv_adjust;
bool reserved = false;
+ struct dhugetlb_pool *hpool = HUGETLBFS_I(inode)->hpool;
- rsv_adjust = hugepage_subpool_get_pages(spool, 1);
+ rsv_adjust = hugepage_subpool_get_pages(spool, 1, hpool);
if (rsv_adjust > 0) {
struct hstate *h = hstate_inode(inode);
- if (!hugetlb_acct_memory(h, 1))
+ if (!hugetlb_acct_memory(h, 1, hpool))
reserved = true;
} else if (!rsv_adjust) {
reserved = true;
@@ -1300,6 +1322,56 @@ static inline void ClearPageHugeTemporary(struct page *page)
page[2].mapping = NULL;
}
+#ifdef CONFIG_DYNAMIC_HUGETLB
+static void free_huge_page_to_dhugetlb_pool(struct page *page,
+ bool restore_reserve)
+{
+ struct hstate *h = page_hstate(page);
+ struct dhugetlb_pool *hpool;
+
+ hpool = get_dhugetlb_pool_from_dhugetlb_pagelist(page);
+ if (unlikely(!hpool)) {
+ pr_err("dhugetlb: free error: get hpool failed\n");
+ return;
+ }
+
+ spin_lock(&hpool->lock);
+ ClearPagePool(page);
+ set_compound_page_dtor(page, NULL_COMPOUND_DTOR);
+ if (!hstate_is_gigantic(h)) {
+ list_add(&page->lru, &hpool->dhugetlb_2M_freelists);
+ hpool->free_reserved_2M++;
+ hpool->used_2M--;
+ if (restore_reserve) {
+ hpool->mmap_reserved_2M++;
+ trace_dhugetlb_acct_memory(hpool,
+ hpool->mmap_reserved_2M,
+ DHUGETLB_RESV_2M);
+ }
+ trace_dhugetlb_alloc_free(hpool, page, hpool->free_reserved_2M,
+ DHUGETLB_FREE_2M);
+ } else {
+ list_add(&page->lru, &hpool->dhugetlb_1G_freelists);
+ hpool->free_reserved_1G++;
+ hpool->used_1G--;
+ if (restore_reserve) {
+ hpool->mmap_reserved_1G++;
+ trace_dhugetlb_acct_memory(hpool,
+ hpool->mmap_reserved_1G,
+ DHUGETLB_RESV_1G);
+ }
+ trace_dhugetlb_alloc_free(hpool, page, hpool->free_reserved_1G,
+ DHUGETLB_FREE_1G);
+ }
+ spin_unlock(&hpool->lock);
+ dhugetlb_pool_put(hpool);
+}
+#else
+void free_huge_page_to_dhugetlb_pool(struct page *page, bool restore_reserve)
+{
+}
+#endif
+
void free_huge_page(struct page *page)
{
/*
@@ -1320,6 +1392,17 @@ void free_huge_page(struct page *page)
restore_reserve = PagePrivate(page);
ClearPagePrivate(page);
+ if (dhugetlb_enabled && PagePool(page)) {
+ spin_lock(&hugetlb_lock);
+ clear_page_huge_active(page);
+ list_del(&page->lru);
+ hugetlb_cgroup_uncharge_page(hstate_index(h),
+ pages_per_huge_page(h), page);
+ spin_unlock(&hugetlb_lock);
+ free_huge_page_to_dhugetlb_pool(page, restore_reserve);
+ return;
+ }
+
/*
* If PagePrivate() was set on page, page allocation consumed a
* reservation. If the page was associated with a subpool, there
@@ -1335,7 +1418,7 @@ void free_huge_page(struct page *page)
* after page is free. Therefore, force restore_reserve
* operation.
*/
- if (hugepage_subpool_put_pages(spool, 1) == 0)
+ if (hugepage_subpool_put_pages(spool, 1, NULL) == 0)
restore_reserve = true;
}
@@ -2211,6 +2294,81 @@ static void restore_reserve_on_error(struct hstate *h,
}
}
+#ifdef CONFIG_DYNAMIC_HUGETLB
+static struct page *__alloc_huge_page_from_dhugetlb_pool(
+ struct dhugetlb_pool *hpool, int idx, bool need_unreserved)
+{
+ unsigned long flags;
+ struct page *page = NULL;
+
+ spin_lock_irqsave(&hpool->lock, flags);
+ if (hstate_is_gigantic(&hstates[idx]) && hpool->free_reserved_1G) {
+ page = list_entry(hpool->dhugetlb_1G_freelists.next,
+ struct page, lru);
+ list_del(&page->lru);
+ hpool->free_reserved_1G--;
+ hpool->used_1G++;
+ if (need_unreserved) {
+ SetPagePrivate(page);
+ hpool->mmap_reserved_1G--;
+ trace_dhugetlb_acct_memory(hpool,
+ hpool->mmap_reserved_1G,
+ DHUGETLB_UNRESV_1G);
+ }
+ trace_dhugetlb_alloc_free(hpool, page, hpool->free_reserved_1G,
+ DHUGETLB_ALLOC_1G);
+ } else if (!hstate_is_gigantic(&hstates[idx]) &&
+ hpool->free_reserved_2M) {
+ page = list_entry(hpool->dhugetlb_2M_freelists.next,
+ struct page, lru);
+ list_del(&page->lru);
+ hpool->free_reserved_2M--;
+ hpool->used_2M++;
+ if (need_unreserved) {
+ SetPagePrivate(page);
+ hpool->mmap_reserved_2M--;
+ trace_dhugetlb_acct_memory(hpool,
+ hpool->mmap_reserved_2M,
+ DHUGETLB_UNRESV_2M);
+ }
+ trace_dhugetlb_alloc_free(hpool, page, hpool->free_reserved_2M,
+ DHUGETLB_ALLOC_2M);
+ }
+ if (page) {
+ INIT_LIST_HEAD(&page->lru);
+ set_compound_page_dtor(page, HUGETLB_PAGE_DTOR);
+ set_page_refcounted(page);
+ SetPagePool(page);
+ }
+ spin_unlock_irqrestore(&hpool->lock, flags);
+
+ return page;
+}
+
+static struct page *alloc_huge_page_from_dhugetlb_pool(
+ struct vm_area_struct *vma, int idx, int avoid_reserve,
+ long gbl_chg, struct dhugetlb_pool *hpool)
+{
+ struct page *page;
+ bool need_unreserved = false;
+
+ if (!avoid_reserve && vma_has_reserves(vma, gbl_chg))
+ need_unreserved = true;
+
+ page = __alloc_huge_page_from_dhugetlb_pool(hpool, idx,
+ need_unreserved);
+
+ return page;
+}
+#else
+static inline struct page *alloc_huge_page_from_dhugetlb_pool(
+ struct vm_area_struct *vma, int idx, int avoid_reserve,
+ long gbl_chg, struct dhugetlb_pool *hpool)
+{
+ return NULL;
+}
+#endif
+
struct page *alloc_huge_page(struct vm_area_struct *vma,
unsigned long addr, int avoid_reserve)
{
@@ -2221,6 +2379,8 @@ struct page *alloc_huge_page(struct vm_area_struct *vma,
long gbl_chg;
int ret, idx;
struct hugetlb_cgroup *h_cg;
+ struct dhugetlb_pool *hpool =
+ HUGETLBFS_I(file_inode(vma->vm_file))->hpool;
idx = hstate_index(h);
/*
@@ -2240,7 +2400,7 @@ struct page *alloc_huge_page(struct vm_area_struct *vma,
* checked against any subpool limit.
*/
if (map_chg || avoid_reserve) {
- gbl_chg = hugepage_subpool_get_pages(spool, 1);
+ gbl_chg = hugepage_subpool_get_pages(spool, 1, hpool);
if (gbl_chg < 0) {
vma_end_reservation(h, vma, addr);
return ERR_PTR(-ENOSPC);
@@ -2262,6 +2422,26 @@ struct page *alloc_huge_page(struct vm_area_struct *vma,
if (ret)
goto out_subpool_put;
+ if (dhugetlb_enabled && hpool) {
+ page = alloc_huge_page_from_dhugetlb_pool(vma, idx,
+ avoid_reserve,
+ gbl_chg, hpool);
+ if (page) {
+ /*
+ * Use hugetlb_lock to manage the account of
+ * hugetlb cgroup.
+ */
+ spin_lock(&hugetlb_lock);
+ list_add(&page->lru, &h->hugepage_activelist);
+ hugetlb_cgroup_commit_charge(idx,
+ pages_per_huge_page(hstate_vma(vma)),
+ h_cg, page);
+ spin_unlock(&hugetlb_lock);
+ goto out;
+ }
+ goto out_uncharge_cgroup;
+ }
+
spin_lock(&hugetlb_lock);
/*
* glb_chg is passed to indicate whether or not a page must be taken
@@ -2284,7 +2464,7 @@ struct page *alloc_huge_page(struct vm_area_struct *vma,
}
hugetlb_cgroup_commit_charge(idx, pages_per_huge_page(h), h_cg, page);
spin_unlock(&hugetlb_lock);
-
+out:
set_page_private(page, (unsigned long)spool);
map_commit = vma_commit_reservation(h, vma, addr);
@@ -2300,8 +2480,8 @@ struct page *alloc_huge_page(struct vm_area_struct *vma,
*/
long rsv_adjust;
- rsv_adjust = hugepage_subpool_put_pages(spool, 1);
- hugetlb_acct_memory(h, -rsv_adjust);
+ rsv_adjust = hugepage_subpool_put_pages(spool, 1, hpool);
+ hugetlb_acct_memory(h, -rsv_adjust, hpool);
}
return page;
@@ -2309,7 +2489,7 @@ struct page *alloc_huge_page(struct vm_area_struct *vma,
hugetlb_cgroup_uncharge_cgroup(idx, pages_per_huge_page(h), h_cg);
out_subpool_put:
if (map_chg || avoid_reserve)
- hugepage_subpool_put_pages(spool, 1);
+ hugepage_subpool_put_pages(spool, 1, hpool);
vma_end_reservation(h, vma, addr);
return ERR_PTR(-ENOSPC);
}
@@ -3098,6 +3278,932 @@ static void hugetlb_register_all_nodes(void) { }
#endif
+#ifdef CONFIG_DYNAMIC_HUGETLB
+static bool enable_dhugetlb;
+DEFINE_STATIC_KEY_FALSE(dhugetlb_enabled_key);
+DEFINE_RWLOCK(dhugetlb_pagelist_rwlock);
+struct dhugetlb_pagelist *dhugetlb_pagelist_t;
+
+bool dhugetlb_pool_get(struct dhugetlb_pool *hpool)
+{
+ if (!hpool)
+ return false;
+
+ return atomic_inc_not_zero(&hpool->refcnt);
+}
+
+void dhugetlb_pool_put(struct dhugetlb_pool *hpool)
+{
+ if (!dhugetlb_enabled || !hpool)
+ return;
+
+ if (atomic_dec_and_test(&hpool->refcnt)) {
+ css_put(&hpool->attach_memcg->css);
+ kfree(hpool);
+ }
+}
+
+struct dhugetlb_pool *hpool_alloc(unsigned long nid)
+{
+ int i;
+ struct dhugetlb_pool *hpool;
+
+ hpool = kzalloc(sizeof(struct dhugetlb_pool) +
+ NR_SMPOOL * sizeof(struct small_page_pool), GFP_KERNEL);
+ if (!hpool)
+ return NULL;
+
+ spin_lock_init(&hpool->lock);
+ spin_lock_init(&hpool->reserved_lock);
+ hpool->nid = nid;
+ atomic_set(&hpool->refcnt, 1);
+ INIT_LIST_HEAD(&hpool->dhugetlb_1G_freelists);
+ INIT_LIST_HEAD(&hpool->dhugetlb_2M_freelists);
+ INIT_LIST_HEAD(&hpool->dhugetlb_4K_freelists);
+ INIT_LIST_HEAD(&hpool->split_1G_freelists);
+ INIT_LIST_HEAD(&hpool->split_2M_freelists);
+
+ for (i = 0; i < NR_SMPOOL; i++) {
+ spin_lock_init(&hpool->smpool[i].lock);
+ INIT_LIST_HEAD(&hpool->smpool[i].head_page);
+ }
+
+ return hpool;
+}
+
+int alloc_hugepage_from_hugetlb(struct dhugetlb_pool *hpool,
+ unsigned long nid, unsigned long size)
+{
+ int ret;
+ struct page *page, *next;
+ unsigned long idx;
+ unsigned long i = 0;
+ struct hstate *h = size_to_hstate(PUD_SIZE);
+
+ if (!h)
+ return -ENOMEM;
+
+ spin_lock(&hpool->lock);
+ spin_lock(&hugetlb_lock);
+ if (h->free_huge_pages_node[nid] < size) {
+ ret = -ENOMEM;
+ goto out_unlock;
+ }
+
+ list_for_each_entry_safe(page, next, &h->hugepage_freelists[nid], lru) {
+ idx = page_to_pfn(page) >> (PUD_SHIFT - PAGE_SHIFT);
+ ret = update_dhugetlb_pagelist(idx, hpool);
+ if (ret)
+ continue;
+ ClearPageHugeFreed(page);
+ list_move_tail(&page->lru, &hpool->dhugetlb_1G_freelists);
+ h->free_huge_pages--;
+ h->free_huge_pages_node[nid]--;
+ hpool->total_nr_pages++;
+ hpool->free_unreserved_1G++;
+ if (++i == size)
+ break;
+ }
+ ret = 0;
+out_unlock:
+ spin_unlock(&hugetlb_lock);
+ spin_unlock(&hpool->lock);
+ return ret;
+}
+
+/*
+ * When we assign a hugepage to dhugetlb_pool, we need to record it in
+ * dhugetlb_pagelist_t. In this situation, we just need read_lock because
+ * there is not conflit when write to dhugetlb_pagelist_t->hpool.
+ *
+ * If page's pfn is greater than dhugetlb_pagelist_t->count (which may
+ * occurs due to memory hotplug), we need to realloc enough memory so that
+ * pfn = dhugetlb_pagelist_t->count - 1 and then record it.
+ * In this situation, we need write_lock because while we are reallocating,
+ * the read request should wait.
+ */
+int update_dhugetlb_pagelist(unsigned long idx, struct dhugetlb_pool *hpool)
+{
+ read_lock(&dhugetlb_pagelist_rwlock);
+ if (idx >= dhugetlb_pagelist_t->count) {
+ unsigned long size;
+ struct dhugetlb_pagelist *tmp;
+
+ read_unlock(&dhugetlb_pagelist_rwlock);
+ write_lock(&dhugetlb_pagelist_rwlock);
+
+ size = sizeof(struct dhugetlb_pagelist) +
+ (idx + 1) * sizeof(struct dhugetlb_pool *);
+ tmp = krealloc(dhugetlb_pagelist_t, size, GFP_ATOMIC);
+ if (!tmp) {
+ write_unlock(&dhugetlb_pagelist_rwlock);
+ return -ENOMEM;
+ }
+ tmp->count = idx + 1;
+ dhugetlb_pagelist_t = tmp;
+
+ write_unlock(&dhugetlb_pagelist_rwlock);
+ read_lock(&dhugetlb_pagelist_rwlock);
+ }
+ dhugetlb_pagelist_t->hpool[idx] = hpool;
+ read_unlock(&dhugetlb_pagelist_rwlock);
+ return 0;
+}
+
+struct dhugetlb_pool *get_dhugetlb_pool_from_dhugetlb_pagelist(
+ struct page *page)
+{
+ struct dhugetlb_pool *hpool = NULL;
+ unsigned long idx = page_to_pfn(page) >> (PUD_SHIFT - PAGE_SHIFT);
+
+ read_lock(&dhugetlb_pagelist_rwlock);
+ if (idx < dhugetlb_pagelist_t->count)
+ hpool = dhugetlb_pagelist_t->hpool[idx];
+ read_unlock(&dhugetlb_pagelist_rwlock);
+ if (dhugetlb_pool_get(hpool))
+ return hpool;
+ return NULL;
+}
+
+struct dhugetlb_pool *get_dhugetlb_pool_from_task(struct task_struct *tsk)
+{
+ struct mem_cgroup *memcg;
+ struct dhugetlb_pool *hpool;
+
+ if (!dhugetlb_enabled)
+ return NULL;
+
+ rcu_read_lock();
+ memcg = mem_cgroup_from_task(tsk);
+ rcu_read_unlock();
+
+ hpool = get_dhugetlb_pool_from_memcg(memcg);
+
+ return hpool;
+}
+
+static void add_new_huge_page_to_pool(struct dhugetlb_pool *hpool,
+ struct page *page, bool gigantic)
+{
+ lockdep_assert_held(&hpool->lock);
+ VM_BUG_ON_PAGE(page_mapcount(page), page);
+ INIT_LIST_HEAD(&page->lru);
+
+ if (gigantic) {
+ prep_compound_gigantic_page(page, PUD_SHIFT - PAGE_SHIFT);
+ list_add_tail(&page->lru, &hpool->dhugetlb_1G_freelists);
+ hpool->free_unreserved_1G++;
+ } else {
+ prep_new_page(page, PMD_SHIFT - PAGE_SHIFT, __GFP_COMP, 0);
+ set_page_count(page, 0);
+ list_add_tail(&page->lru, &hpool->dhugetlb_2M_freelists);
+ hpool->free_unreserved_2M++;
+ }
+ set_page_private(page, 0);
+ page->mapping = NULL;
+ set_compound_page_dtor(page, HUGETLB_PAGE_DTOR);
+ set_hugetlb_cgroup(page, NULL);
+}
+
+static void free_dhugetlb_pcpool(struct dhugetlb_pool *hpool)
+{
+ int i;
+ struct small_page_pool *smpool;
+
+ for (i = 0; i < NR_SMPOOL; i++) {
+ smpool = &hpool->smpool[i];
+ list_splice(&smpool->head_page, &hpool->dhugetlb_4K_freelists);
+ smpool->free_pages = 0;
+ smpool->used_pages = 0;
+ INIT_LIST_HEAD(&smpool->head_page);
+ }
+}
+
+static void __free_dhugetlb_small_page(struct dhugetlb_pool *hpool)
+{
+ struct page *page, *next;
+ struct split_pages *split_huge, *split_next;
+
+ if (list_empty(&hpool->dhugetlb_4K_freelists))
+ return;
+
+ list_for_each_entry_safe(page, next,
+ &hpool->dhugetlb_4K_freelists, lru) {
+ list_del(&page->lru);
+ add_new_huge_page_to_pool(hpool, page, false);
+ }
+
+ list_for_each_entry_safe(split_huge, split_next,
+ &hpool->split_2M_freelists, list) {
+ list_del(&split_huge->list);
+ kfree(split_huge);
+ hpool->nr_split_2M--;
+ }
+
+ hpool->free_pages = 0;
+ INIT_LIST_HEAD(&hpool->dhugetlb_4K_freelists);
+}
+
+static void free_dhugetlb_small_page(struct dhugetlb_pool *hpool)
+{
+ struct page *page, *next;
+ unsigned long nr_pages = 1 << (PMD_SHIFT - PAGE_SHIFT);
+
+ lockdep_assert_held(&hpool->lock);
+ if (list_empty(&hpool->dhugetlb_4K_freelists))
+ return;
+
+ list_for_each_entry_safe(page, next,
+ &hpool->dhugetlb_4K_freelists, lru) {
+ if (page_to_pfn(page) % nr_pages != 0)
+ list_del(&page->lru);
+ }
+
+ __free_dhugetlb_small_page(hpool);
+}
+
+static void __free_dhugetlb_huge_page(struct dhugetlb_pool *hpool)
+{
+ struct page *page, *next;
+ struct split_pages *split_giga, *split_next;
+
+ if (list_empty(&hpool->dhugetlb_2M_freelists))
+ return;
+
+ list_for_each_entry_safe(page, next,
+ &hpool->dhugetlb_2M_freelists, lru) {
+ list_del(&page->lru);
+ add_new_huge_page_to_pool(hpool, page, true);
+ }
+ list_for_each_entry_safe(split_giga, split_next,
+ &hpool->split_1G_freelists, list) {
+ list_del(&split_giga->list);
+ kfree(split_giga);
+ hpool->nr_split_1G--;
+ }
+
+ hpool->total_reserved_2M = 0;
+ hpool->free_reserved_2M = 0;
+ hpool->free_unreserved_2M = 0;
+ INIT_LIST_HEAD(&hpool->dhugetlb_2M_freelists);
+}
+
+static void free_dhugetlb_huge_page(struct dhugetlb_pool *hpool)
+{
+ struct page *page, *next;
+ unsigned long nr_pages = 1 << (PUD_SHIFT - PAGE_SHIFT);
+ unsigned long block_size = 1 << (PMD_SHIFT - PAGE_SHIFT);
+ int i;
+
+ lockdep_assert_held(&hpool->lock);
+ if (list_empty(&hpool->dhugetlb_2M_freelists))
+ return;
+
+ list_for_each_entry_safe(page, next,
+ &hpool->dhugetlb_2M_freelists, lru) {
+ set_compound_page_dtor(page, NULL_COMPOUND_DTOR);
+ atomic_set(compound_mapcount_ptr(page), 0);
+ for (i = 1; i < block_size; i++)
+ clear_compound_head(&page[i]);
+ set_compound_order(page, 0);
+ __ClearPageHead(page);
+ if (page_to_pfn(page) % nr_pages != 0)
+ list_del(&page->lru);
+ }
+ __free_dhugetlb_huge_page(hpool);
+}
+
+static int try_migrate_page(struct page *page, unsigned long nid)
+{
+ unsigned long pfn = page_to_pfn(page);
+ int ret = 0;
+
+ LIST_HEAD(source);
+
+ if (!pfn_valid(pfn))
+ return 0;
+ BUG_ON(PageHuge(page) || PageTransHuge(page));
+ /*
+ * HWPoison pages have elevated reference counts so the migration
+ * would fail on them. It also doesn't make any sense to migrate them
+ * in the first place. Still try to unmap such a page in case it is
+ * still mapped(e.g. current hwpoison implementation doesn't unmap
+ * KSM pages but keep the unmap as the catch all safety net).
+ */
+ if (PageHWPoison(page)) {
+ if (WARN_ON(PageLRU(page)))
+ isolate_lru_page(page);
+ if (page_mapped(page))
+ try_to_unmap(page,
+ TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS);
+ return 0;
+ }
+
+ if (!get_page_unless_zero(page))
+ return 0;
+ /*
+ * We can skip free pages. And we can deal with pages on
+ * LRU and non-lru movable pages.
+ */
+ if (PageLRU(page))
+ ret = isolate_lru_page(page);
+ else
+ ret = isolate_movable_page(page, ISOLATE_UNEVICTABLE);
+ put_page(page);
+ if (ret) {
+ if (page_count(page))
+ ret = -EBUSY;
+ return ret;
+ }
+ list_add_tail(&page->lru, &source);
+ if (!__PageMovable(page))
+ inc_node_page_state(page,
+ NR_ISOLATED_ANON + page_is_file_cache(page));
+
+ ret = migrate_pages(&source, alloc_new_node_page, NULL, nid,
+ MIGRATE_SYNC_LIGHT, MR_COMPACTION);
+ if (ret)
+ putback_movable_pages(&source);
+ return ret;
+}
+
+static void try_migrate_pages(struct dhugetlb_pool *hpool)
+{
+ int i, j;
+ unsigned long nr_free_pages;
+ struct split_pages *split_giga, *next;
+ unsigned int nr_pages = 1 << (PMD_SHIFT - PAGE_SHIFT);
+ struct page *page;
+ int sleep_interval = 100; /* wait for the migration */
+
+ spin_unlock(&hpool->lock);
+ for (i = NR_SMPOOL - 1; i >= 0; i--)
+ spin_unlock(&hpool->smpool[i].lock);
+
+ msleep(sleep_interval);
+ dhugetlb_pool_force_empty(hpool->attach_memcg);
+
+ spin_lock(&hpool->lock);
+ nr_free_pages = hpool->free_pages;
+ spin_unlock(&hpool->lock);
+ for (i = 0; i < NR_SMPOOL; i++) {
+ spin_lock(&hpool->smpool[i].lock);
+ nr_free_pages += hpool->smpool[i].free_pages;
+ spin_unlock(&hpool->smpool[i].lock);
+ }
+
+ if (nr_free_pages >> HUGETLB_PAGE_ORDER < hpool->nr_split_2M) {
+ list_for_each_entry_safe(split_giga, next,
+ &hpool->split_1G_freelists, list) {
+ for (i = 0; i < nr_pages; i++) {
+ if (PageCompound(pfn_to_page(
+ split_giga->start_pfn + i * nr_pages)))
+ continue;
+ page = pfn_to_page(split_giga->start_pfn +
+ i * nr_pages);
+ for (j = 0; j < nr_pages; j++) {
+ if (PagePool(page + j))
+ try_migrate_page(page + j,
+ hpool->nid);
+ }
+ }
+ }
+ }
+
+ for (i = 0; i < NR_SMPOOL; i++)
+ spin_lock(&hpool->smpool[i].lock);
+ spin_lock(&hpool->lock);
+}
+
+/*
+ * If there are some pages are still in use. We will try to reclaim/migrate it.
+ * After trying at most HPOOL_RECLAIM_RETRIES times, we may success.
+ * Or we will print the failed information and return false.
+ */
+static bool free_dhugetlb_pages(struct dhugetlb_pool *hpool)
+{
+ int i;
+ long used_pages;
+ int try_count = 0;
+
+retry:
+ used_pages = 0;
+ for (i = 0; i < NR_SMPOOL; i++)
+ used_pages += hpool->smpool[i].used_pages;
+
+ if (try_count < HPOOL_RECLAIM_RETRIES &&
+ (used_pages || hpool->used_2M || hpool->used_1G)) {
+ try_migrate_pages(hpool);
+ try_count++;
+ goto retry;
+ }
+
+ if (used_pages)
+ pr_err("dhugetlb: some 4K pages not free, memcg: %s delete failed!\n",
+ hpool->attach_memcg->css.cgroup->kn->name);
+ else if (hpool->used_2M)
+ pr_err("dhugetlb: some 2M pages not free, memcg: %s delete failed!\n",
+ hpool->attach_memcg->css.cgroup->kn->name);
+ else if (hpool->used_1G)
+ pr_err("dhugetlb: some 1G pages not free, memcg: %s delete failed!\n",
+ hpool->attach_memcg->css.cgroup->kn->name);
+ else {
+ free_dhugetlb_pcpool(hpool);
+ free_dhugetlb_small_page(hpool);
+ free_dhugetlb_huge_page(hpool);
+ return true;
+ }
+ return false;
+}
+
+static void free_back_hugetlb(struct dhugetlb_pool *hpool)
+{
+ int nid;
+ unsigned int nr_pages;
+ unsigned long pfn, idx;
+ struct page *page, *page_next, *p;
+ struct hstate *h = size_to_hstate(PUD_SIZE);
+
+ if (!h)
+ return;
+
+ spin_lock(&hugetlb_lock);
+ list_for_each_entry_safe(page, page_next,
+ &hpool->dhugetlb_1G_freelists, lru) {
+ nr_pages = 1 << huge_page_order(h);
+ pfn = page_to_pfn(page);
+ for (; nr_pages--; pfn++) {
+ p = pfn_to_page(pfn);
+ p->mapping = NULL;
+ }
+ SetPageHugeFreed(page);
+ set_compound_page_dtor(page, HUGETLB_PAGE_DTOR);
+ nid = page_to_nid(page);
+ BUG_ON(nid >= MAX_NUMNODES);
+ list_move(&page->lru, &h->hugepage_freelists[nid]);
+ h->free_huge_pages_node[nid]++;
+ read_lock(&dhugetlb_pagelist_rwlock);
+ idx = page_to_pfn(page) >> (PUD_SHIFT - PAGE_SHIFT);
+ if (idx < dhugetlb_pagelist_t->count)
+ dhugetlb_pagelist_t->hpool[idx] = NULL;
+ read_unlock(&dhugetlb_pagelist_rwlock);
+ }
+ h->free_huge_pages += hpool->total_nr_pages;
+ hpool->total_nr_pages = 0;
+ hpool->free_unreserved_1G = 0;
+ hpool->free_reserved_1G = 0;
+ hpool->total_reserved_1G = 0;
+ INIT_LIST_HEAD(&hpool->dhugetlb_1G_freelists);
+ spin_unlock(&hugetlb_lock);
+}
+
+bool free_dhugetlb_pool(struct dhugetlb_pool *hpool)
+{
+ int i;
+ bool ret = false;
+
+ for (i = 0; i < NR_SMPOOL; i++)
+ spin_lock(&hpool->smpool[i].lock);
+ spin_lock(&hpool->lock);
+
+ ret = free_dhugetlb_pages(hpool);
+ if (!ret)
+ goto out_unlock;
+
+ free_back_hugetlb(hpool);
+
+out_unlock:
+ spin_unlock(&hpool->lock);
+ for (i = NR_SMPOOL - 1; i >= 0; i--)
+ spin_unlock(&hpool->smpool[i].lock);
+
+ if (ret)
+ dhugetlb_pool_put(hpool);
+ return ret;
+}
+
+static void __split_free_huge_page(struct dhugetlb_pool *hpool,
+ struct page *page)
+{
+ int i;
+ int order_h = PUD_SHIFT - PAGE_SHIFT;
+ int order_m = PMD_SHIFT - PAGE_SHIFT;
+ int blocks = 1 << (order_h - order_m);
+ struct page *p = page + 1;
+
+ lockdep_assert_held(&hpool->lock);
+ set_compound_page_dtor(page, NULL_COMPOUND_DTOR);
+ atomic_set(compound_mapcount_ptr(page), 0);
+ for (i = 1; i < (1 << order_h); i++, p = mem_map_next(p, page, i))
+ clear_compound_head(p);
+
+ set_compound_order(page, 0);
+ __ClearPageHead(page);
+
+ /* make it be 2M huge pages and put it to huge pool */
+ for (i = 0; i < blocks; i++, page += (1 << order_m))
+ add_new_huge_page_to_pool(hpool, page, false);
+}
+
+static void __split_free_small_page(struct dhugetlb_pool *hpool,
+ struct page *page)
+{
+ int i;
+ int nr_pages = 1 << (PMD_SHIFT - PAGE_SHIFT);
+
+ lockdep_assert_held(&hpool->lock);
+ set_compound_page_dtor(page, NULL_COMPOUND_DTOR);
+ set_compound_order(page, 0);
+ for (i = 0; i < nr_pages; i++) {
+ if (i != 0) {
+ page[i].mapping = NULL;
+ clear_compound_head(&page[i]);
+ } else
+ __ClearPageHead(page);
+
+ /*
+ * If a hugepage is mapped in private mode, the PG_uptodate bit
+ * will not be cleared when the hugepage freed. Clear the
+ * hugepage using free_pages_prepare() here.
+ */
+ free_pages_prepare(&page[i], 0, false);
+ hpool->free_pages++;
+ list_add_tail(&page[i].lru, &hpool->dhugetlb_4K_freelists);
+ }
+}
+
+static bool split_free_huge_page(struct dhugetlb_pool *hpool)
+{
+ struct page *page;
+ struct split_pages *split_page;
+
+ lockdep_assert_held(&hpool->lock);
+
+ if (!hpool->free_unreserved_1G)
+ return false;
+
+ split_page = kzalloc(sizeof(struct split_pages), GFP_ATOMIC);
+ if (!split_page)
+ return false;
+
+ page = list_entry(hpool->dhugetlb_1G_freelists.next, struct page, lru);
+ list_del(&page->lru);
+ hpool->free_unreserved_1G--;
+
+ split_page->start_pfn = page_to_pfn(page);
+ list_add(&split_page->list, &hpool->split_1G_freelists);
+ hpool->nr_split_1G++;
+
+ trace_dhugetlb_split_merge(hpool, page, DHUGETLB_SPLIT_1G);
+
+ __split_free_huge_page(hpool, page);
+ return true;
+}
+
+static bool split_free_small_page(struct dhugetlb_pool *hpool)
+{
+ struct page *page;
+ struct split_pages *split_page;
+
+ lockdep_assert_held(&hpool->lock);
+
+ if (!hpool->free_unreserved_2M && !split_free_huge_page(hpool))
+ return false;
+
+ split_page = kzalloc(sizeof(struct split_pages), GFP_ATOMIC);
+ if (!split_page)
+ return false;
+
+ page = list_entry(hpool->dhugetlb_2M_freelists.next, struct page, lru);
+ list_del(&page->lru);
+ hpool->free_unreserved_2M--;
+
+ split_page->start_pfn = page_to_pfn(page);
+ list_add(&split_page->list, &hpool->split_2M_freelists);
+ hpool->nr_split_2M++;
+
+ trace_dhugetlb_split_merge(hpool, page, DHUGETLB_SPLIT_2M);
+
+ __split_free_small_page(hpool, page);
+ return true;
+}
+
+bool move_pages_from_hpool_to_smpool(struct dhugetlb_pool *hpool,
+ struct small_page_pool *smpool)
+{
+ int i = 0;
+ struct page *page, *next;
+
+ if (!hpool->free_pages && !split_free_small_page(hpool))
+ return false;
+
+ list_for_each_entry_safe(page, next,
+ &hpool->dhugetlb_4K_freelists, lru) {
+ list_del(&page->lru);
+ hpool->free_pages--;
+ list_add_tail(&page->lru, &smpool->head_page);
+ smpool->free_pages++;
+ if (++i == BATCH_SMPOOL_PAGE)
+ break;
+ }
+ return true;
+}
+
+void move_pages_from_smpool_to_hpool(struct dhugetlb_pool *hpool,
+ struct small_page_pool *smpool)
+{
+ int i = 0;
+ struct page *page, *next;
+
+ list_for_each_entry_safe(page, next, &smpool->head_page, lru) {
+ list_del(&page->lru);
+ smpool->free_pages--;
+ list_add(&page->lru, &hpool->dhugetlb_4K_freelists);
+ hpool->free_pages++;
+ if (++i == BATCH_SMPOOL_PAGE)
+ break;
+ }
+}
+
+static unsigned long list_len(struct list_head *head)
+{
+ unsigned long len = 0;
+ struct page *page;
+
+ list_for_each_entry(page, head, lru)
+ len++;
+
+ return len;
+}
+
+static void hugetlb_migrate_pages(struct dhugetlb_pool *hpool,
+ unsigned long count)
+{
+ int i, try;
+ struct page *page;
+ struct split_pages *split_huge, *split_next;
+ unsigned long nr_pages = 1 << (PMD_SHIFT - PAGE_SHIFT);
+ LIST_HEAD(wait_page_list);
+
+ list_for_each_entry_safe(split_huge, split_next,
+ &hpool->split_2M_freelists, list) {
+ /*
+ * Isolate free page first because we dont want them to be
+ * allocated.
+ */
+ for (i = 0; i < nr_pages; i++) {
+ page = pfn_to_page(split_huge->start_pfn + i);
+ if (!PagePool(page))
+ list_move(&page->lru, &wait_page_list);
+ }
+
+ for (try = 0; try < HPOOL_RECLAIM_RETRIES; try++) {
+ /*
+ * Unlock and try migration, after migration we need
+ * to lock back.
+ */
+ for (i = 0; i < NR_SMPOOL; i++)
+ hpool->smpool[i].free_pages =
+ list_len(&hpool->smpool[i].head_page);
+ hpool->free_pages =
+ list_len(&hpool->dhugetlb_4K_freelists);
+ spin_unlock(&hpool->lock);
+ for (i = NR_SMPOOL - 1; i >= 0; i--)
+ spin_unlock(&hpool->smpool[i].lock);
+
+ for (i = 0; i < nr_pages; i++) {
+ page = pfn_to_page(split_huge->start_pfn + i);
+ if (PagePool(page))
+ try_migrate_page(page, hpool->nid);
+ }
+ for (i = 0; i < NR_SMPOOL; i++)
+ spin_lock(&hpool->smpool[i].lock);
+ spin_lock(&hpool->lock);
+
+ /*
+ * Isolate free page. If all page in the split_huge
+ * is free, return it.
+ */
+ split_huge->free_pages = 0;
+ for (i = 0; i < nr_pages; i++) {
+ page = pfn_to_page(split_huge->start_pfn + i);
+ if (!PagePool(page)) {
+ list_move(&page->lru, &wait_page_list);
+ split_huge->free_pages++;
+ }
+ }
+ if (split_huge->free_pages == nr_pages)
+ break;
+ }
+ if (split_huge->free_pages == nr_pages) {
+ for (i = 0; i < nr_pages; i++) {
+ page = pfn_to_page(split_huge->start_pfn + i);
+ list_del(&page->lru);
+ }
+ INIT_LIST_HEAD(&wait_page_list);
+ page = pfn_to_page(split_huge->start_pfn);
+ add_new_huge_page_to_pool(hpool, page, false);
+ list_del(&split_huge->list);
+ kfree(split_huge);
+ hpool->nr_split_2M--;
+
+ trace_dhugetlb_split_merge(hpool, page,
+ DHUGETLB_MIGRATE_4K);
+
+ if (--count == 0)
+ return;
+ } else {
+ /* Failed, put back the isolate pages */
+ list_splice(&wait_page_list,
+ &hpool->dhugetlb_4K_freelists);
+ INIT_LIST_HEAD(&wait_page_list);
+ }
+ }
+}
+
+static unsigned long merge_free_split_huge(struct dhugetlb_pool *hpool,
+ unsigned long count)
+{
+ int i;
+ struct page *page;
+ struct split_pages *split_huge, *split_next;
+ unsigned long nr_pages = 1 << (PMD_SHIFT - PAGE_SHIFT);
+
+ list_for_each_entry_safe(split_huge, split_next,
+ &hpool->split_2M_freelists, list) {
+ split_huge->free_pages = 0;
+ for (i = 0; i < nr_pages; i++) {
+ page = pfn_to_page(split_huge->start_pfn + i);
+ if (!PagePool(page))
+ split_huge->free_pages++;
+ }
+ if (split_huge->free_pages == nr_pages) {
+ for (i = 0; i < nr_pages; i++) {
+ page = pfn_to_page(split_huge->start_pfn + i);
+ list_del(&page->lru);
+ }
+ page = pfn_to_page(split_huge->start_pfn);
+ add_new_huge_page_to_pool(hpool, page, false);
+ list_del(&split_huge->list);
+ kfree(split_huge);
+ hpool->nr_split_2M--;
+
+ trace_dhugetlb_split_merge(hpool, page,
+ DHUGETLB_MERGE_4K);
+
+ if (--count == 0)
+ return 0;
+ }
+ }
+ return count;
+}
+
+static void merge_free_small_page(struct dhugetlb_pool *hpool,
+ unsigned long count)
+{
+ int i;
+ unsigned long need_migrate;
+
+ if (!hpool->nr_split_2M)
+ return;
+
+ need_migrate = merge_free_split_huge(hpool, count);
+ if (need_migrate)
+ hugetlb_migrate_pages(hpool, need_migrate);
+
+ for (i = 0; i < NR_SMPOOL; i++)
+ hpool->smpool[i].free_pages =
+ list_len(&hpool->smpool[i].head_page);
+ hpool->free_pages = list_len(&hpool->dhugetlb_4K_freelists);
+}
+
+static void dhugetlb_collect_2M_pages(struct dhugetlb_pool *hpool,
+ unsigned long count)
+{
+ int i;
+
+ while (hpool->free_unreserved_1G &&
+ count > hpool->free_unreserved_2M)
+ split_free_huge_page(hpool);
+
+	/*
+	 * If we try to merge 4K pages into 2M pages, we need to unlock
+	 * hpool->lock first, and then take all the locks in a fixed order
+	 * to avoid deadlock.
+	 */
+ if (count > hpool->free_unreserved_2M) {
+ spin_unlock(&hpool->lock);
+ for (i = 0; i < NR_SMPOOL; i++)
+ spin_lock(&hpool->smpool[i].lock);
+ spin_lock(&hpool->lock);
+ merge_free_small_page(hpool, count - hpool->free_unreserved_2M);
+ for (i = NR_SMPOOL - 1; i >= 0; i--)
+ spin_unlock(&hpool->smpool[i].lock);
+ }
+}
+
+/*
+ * Parameter gigantic: true means reserve 1G pages and false means reserve
+ * 2M pages. When we want to reserve more 2M pages than
+ * hpool->free_unreserved_2M, we have to try splitting/merging. Even then,
+ * success cannot be guaranteed.
+ */
+void dhugetlb_reserve_hugepages(struct dhugetlb_pool *hpool,
+ unsigned long count, bool gigantic)
+{
+ unsigned long delta;
+
+ spin_lock(&hpool->lock);
+ if (gigantic) {
+ if (count > hpool->total_reserved_1G) {
+ delta = min(count - hpool->total_reserved_1G,
+ hpool->free_unreserved_1G);
+ hpool->total_reserved_1G += delta;
+ hpool->free_reserved_1G += delta;
+ hpool->free_unreserved_1G -= delta;
+ } else {
+ delta = min(hpool->total_reserved_1G - count,
+ hpool->free_reserved_1G -
+ hpool->mmap_reserved_1G);
+ hpool->total_reserved_1G -= delta;
+ hpool->free_reserved_1G -= delta;
+ hpool->free_unreserved_1G += delta;
+ }
+ } else {
+ if (count > hpool->total_reserved_2M) {
+ delta = count - hpool->total_reserved_2M;
+ if (delta > hpool->free_unreserved_2M)
+ dhugetlb_collect_2M_pages(hpool, delta);
+ delta = min(count - hpool->total_reserved_2M,
+ hpool->free_unreserved_2M);
+ hpool->total_reserved_2M += delta;
+ hpool->free_reserved_2M += delta;
+ hpool->free_unreserved_2M -= delta;
+ } else {
+ delta = min(hpool->total_reserved_2M - count,
+ hpool->free_reserved_2M -
+ hpool->mmap_reserved_2M);
+ hpool->total_reserved_2M -= delta;
+ hpool->free_reserved_2M -= delta;
+ hpool->free_unreserved_2M += delta;
+ }
+ }
+ spin_unlock(&hpool->lock);
+}
+
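A minimal userspace sketch of the bookkeeping dhugetlb_reserve_hugepages() performs for one page size, assuming simplified counters (the struct and the sample values below are illustrative, not the kernel's):

#include <stdio.h>

struct pool {
	unsigned long total_reserved;	/* pages the admin has reserved */
	unsigned long free_reserved;	/* reserved but not yet used */
	unsigned long free_unreserved;	/* free pages not reserved yet */
	unsigned long mmap_reserved;	/* reserved pages pinned by mmap */
};

static unsigned long min_ul(unsigned long a, unsigned long b)
{
	return a < b ? a : b;
}

/* Move the reservation target to 'count', clamped by what is available. */
static void reserve(struct pool *p, unsigned long count)
{
	unsigned long delta;

	if (count > p->total_reserved) {
		/* Grow: limited by pages that are still unreserved. */
		delta = min_ul(count - p->total_reserved, p->free_unreserved);
		p->total_reserved += delta;
		p->free_reserved += delta;
		p->free_unreserved -= delta;
	} else {
		/* Shrink: cannot take back pages already promised to mmap. */
		delta = min_ul(p->total_reserved - count,
			       p->free_reserved - p->mmap_reserved);
		p->total_reserved -= delta;
		p->free_reserved -= delta;
		p->free_unreserved += delta;
	}
}

int main(void)
{
	struct pool p = { .free_unreserved = 8 };

	reserve(&p, 6);		/* total_reserved becomes 6 */
	reserve(&p, 20);	/* clamped: only 2 unreserved pages left */
	printf("total %lu free %lu unreserved %lu\n",
	       p.total_reserved, p.free_reserved, p.free_unreserved);
	return 0;
}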
+static int dhugetlb_acct_memory(struct hstate *h, long delta,
+ struct dhugetlb_pool *hpool)
+{
+ int ret = -ENOMEM;
+
+ if (delta == 0)
+ return 0;
+
+ spin_lock(&hpool->lock);
+ if (hstate_is_gigantic(h)) {
+ if (delta > 0 && delta <= hpool->free_reserved_1G -
+ hpool->mmap_reserved_1G) {
+ hpool->mmap_reserved_1G += delta;
+ ret = 0;
+ trace_dhugetlb_acct_memory(hpool,
+ hpool->mmap_reserved_1G,
+ DHUGETLB_RESV_1G);
+ } else if (delta < 0) {
+ hpool->mmap_reserved_1G -= (unsigned long)(-delta);
+ WARN_ON(hpool->mmap_reserved_1G < 0);
+ ret = 0;
+ trace_dhugetlb_acct_memory(hpool,
+ hpool->mmap_reserved_1G,
+ DHUGETLB_UNRESV_1G);
+ }
+ } else {
+ if (delta > 0 && delta <= hpool->free_reserved_2M -
+ hpool->mmap_reserved_2M) {
+ hpool->mmap_reserved_2M += delta;
+ ret = 0;
+ trace_dhugetlb_acct_memory(hpool,
+ hpool->mmap_reserved_2M,
+ DHUGETLB_RESV_2M);
+ } else if (delta < 0) {
+ hpool->mmap_reserved_2M -= (unsigned long)(-delta);
+ WARN_ON(hpool->mmap_reserved_2M < 0);
+ ret = 0;
+ trace_dhugetlb_acct_memory(hpool,
+ hpool->mmap_reserved_2M,
+ DHUGETLB_UNRESV_2M);
+ }
+ }
+ spin_unlock(&hpool->lock);
+
+ return ret;
+}
+#else
+static int dhugetlb_acct_memory(struct hstate *h, long delta,
+ struct dhugetlb_pool *hpool)
+{
+ return 0;
+}
+#endif /* CONFIG_DYNAMIC_HUGETLB */
+
static int __init hugetlb_init(void)
{
int i;
@@ -3134,6 +4240,23 @@ static int __init hugetlb_init(void)
hugetlb_register_all_nodes();
hugetlb_cgroup_file_init();
+#ifdef CONFIG_DYNAMIC_HUGETLB
+ if (enable_dhugetlb) {
+ unsigned long count = max(max_pfn >> (PUD_SHIFT - PAGE_SHIFT),
+ (unsigned long)DEFAULT_PAGESIZE);
+ unsigned long size = sizeof(struct dhugetlb_pagelist) +
+ count * sizeof(struct dhugetlb_pool *);
+ dhugetlb_pagelist_t = kzalloc(size, GFP_KERNEL);
+ if (dhugetlb_pagelist_t) {
+ dhugetlb_pagelist_t->count = count;
+ static_branch_enable(&dhugetlb_enabled_key);
+ pr_info("Dynamic 1G hugepage enabled\n");
+		} else {
+			pr_info("Dynamic 1G hugepage disabled due to lack of memory, need %lu bytes\n",
+				size);
+		}
+ }
+#endif
+
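The dhugetlb_pagelist_t table allocated above keeps one hpool pointer per 1G-aligned pfn range, with a floor of DEFAULT_PAGESIZE entries. A rough sizing sketch, assuming 4K base pages, a 1G PUD size and DEFAULT_PAGESIZE of 4096 (these constants are assumptions for illustration; the real values come from the architecture headers):

#include <stdio.h>

int main(void)
{
	/* Assumed values: 4K pages, 1G per PUD, 4096-entry minimum. */
	unsigned long page_shift = 12, pud_shift = 30;
	unsigned long default_pagesize = 4096;
	unsigned long max_pfn = (1UL << 40) >> page_shift;	/* 1 TiB of RAM */

	unsigned long count = max_pfn >> (pud_shift - page_shift);
	if (count < default_pagesize)
		count = default_pagesize;

	/* One pointer per entry plus a small header. */
	unsigned long size = sizeof(unsigned long) + count * sizeof(void *);
	printf("%lu entries, about %lu KiB\n", count, size >> 10);
	return 0;
}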
#ifdef CONFIG_SMP
num_fault_mutexes = roundup_pow_of_two(8 * num_possible_cpus());
#else
@@ -3270,6 +4393,16 @@ static int __init hugetlb_nrpages_setup(char *s)
}
__setup("hugepages=", hugetlb_nrpages_setup);
+#ifdef CONFIG_DYNAMIC_HUGETLB
+static int __init dhugetlb_setup(char *s)
+{
+ if (!strcmp(s, "on"))
+ enable_dhugetlb = true;
+ return 1;
+}
+__setup("dynamic_1G_hugepage=", dhugetlb_setup);
+#endif
+
static int __init hugetlb_default_setup(char *s)
{
default_hstate_size = memparse(s, &s);
@@ -3471,10 +4604,14 @@ unsigned long hugetlb_total_pages(void)
return nr_total_pages;
}
-static int hugetlb_acct_memory(struct hstate *h, long delta)
+static int hugetlb_acct_memory(struct hstate *h, long delta,
+ struct dhugetlb_pool *hpool)
{
int ret = -ENOMEM;
+ if (dhugetlb_enabled && hpool)
+ return dhugetlb_acct_memory(h, delta, hpool);
+
spin_lock(&hugetlb_lock);
/*
* When cpuset is configured, it breaks the strict hugetlb page
@@ -3535,6 +4672,8 @@ static void hugetlb_vm_op_close(struct vm_area_struct *vma)
struct hugepage_subpool *spool = subpool_vma(vma);
unsigned long reserve, start, end;
long gbl_reserve;
+ struct dhugetlb_pool *hpool =
+ HUGETLBFS_I(file_inode(vma->vm_file))->hpool;
if (!resv || !is_vma_resv_set(vma, HPAGE_RESV_OWNER))
return;
@@ -3551,8 +4690,8 @@ static void hugetlb_vm_op_close(struct vm_area_struct *vma)
* Decrement reserve counts. The global reserve count may be
* adjusted if the subpool has a minimum size.
*/
- gbl_reserve = hugepage_subpool_put_pages(spool, reserve);
- hugetlb_acct_memory(h, -gbl_reserve);
+ gbl_reserve = hugepage_subpool_put_pages(spool, reserve, hpool);
+ hugetlb_acct_memory(h, -gbl_reserve, hpool);
}
}
@@ -4934,6 +6073,7 @@ int hugetlb_reserve_pages(struct inode *inode,
struct hugepage_subpool *spool = subpool_inode(inode);
struct resv_map *resv_map;
long gbl_reserve;
+ struct dhugetlb_pool *hpool = HUGETLBFS_I(inode)->hpool;
/* This should never happen */
if (from > to) {
@@ -4986,7 +6126,7 @@ int hugetlb_reserve_pages(struct inode *inode,
* the subpool has a minimum size, there may be some global
* reservations already in place (gbl_reserve).
*/
- gbl_reserve = hugepage_subpool_get_pages(spool, chg);
+ gbl_reserve = hugepage_subpool_get_pages(spool, chg, hpool);
if (gbl_reserve < 0) {
ret = -ENOSPC;
goto out_err;
@@ -4996,10 +6136,10 @@ int hugetlb_reserve_pages(struct inode *inode,
* Check enough hugepages are available for the reservation.
* Hand the pages back to the subpool if there are not
*/
- ret = hugetlb_acct_memory(h, gbl_reserve);
+ ret = hugetlb_acct_memory(h, gbl_reserve, hpool);
if (ret < 0) {
/* put back original number of pages, chg */
- (void)hugepage_subpool_put_pages(spool, chg);
+ (void)hugepage_subpool_put_pages(spool, chg, hpool);
goto out_err;
}
@@ -5028,8 +6168,9 @@ int hugetlb_reserve_pages(struct inode *inode,
long rsv_adjust;
rsv_adjust = hugepage_subpool_put_pages(spool,
- chg - add);
- hugetlb_acct_memory(h, -rsv_adjust);
+ chg - add,
+ hpool);
+ hugetlb_acct_memory(h, -rsv_adjust, hpool);
}
}
return 0;
@@ -5051,6 +6192,7 @@ long hugetlb_unreserve_pages(struct inode *inode, long start, long end,
long chg = 0;
struct hugepage_subpool *spool = subpool_inode(inode);
long gbl_reserve;
+ struct dhugetlb_pool *hpool = HUGETLBFS_I(inode)->hpool;
/*
* Since this routine can be called in the evict inode path for all
@@ -5075,8 +6217,8 @@ long hugetlb_unreserve_pages(struct inode *inode, long start, long end,
* If the subpool has a minimum size, the number of global
* reservations to be released may be adjusted.
*/
- gbl_reserve = hugepage_subpool_put_pages(spool, (chg - freed));
- hugetlb_acct_memory(h, -gbl_reserve);
+ gbl_reserve = hugepage_subpool_put_pages(spool, (chg - freed), hpool);
+ hugetlb_acct_memory(h, -gbl_reserve, hpool);
return 0;
}
diff --git a/mm/internal.h b/mm/internal.h
index 1b861446c751..deffd247b010 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -182,6 +182,7 @@ extern void __putback_isolated_page(struct page *page, unsigned int order,
int mt);
extern void __free_pages_core(struct page *page, unsigned int order);
extern void prep_compound_page(struct page *page, unsigned int order);
+extern int check_new_page(struct page *page);
extern void post_alloc_hook(struct page *page, unsigned int order,
gfp_t gfp_flags);
extern int user_min_free_kbytes;
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 63b91a030b02..bdc90e6fc082 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -997,6 +997,41 @@ static __always_inline struct mem_cgroup *get_mem_cgroup_from_current(void)
return get_mem_cgroup_from_mm(current->mm);
}
+#ifdef CONFIG_DYNAMIC_HUGETLB
+void free_page_to_dhugetlb_pool(struct page *page)
+{
+ struct dhugetlb_pool *hpool;
+ struct small_page_pool *smpool;
+ unsigned long flags;
+
+ hpool = get_dhugetlb_pool_from_dhugetlb_pagelist(page);
+ if (unlikely(!hpool)) {
+ pr_err("dhugetlb: free error: get hpool failed\n");
+ return;
+ }
+
+ smpool = &hpool->smpool[smp_processor_id()];
+ spin_lock_irqsave(&smpool->lock, flags);
+
+ ClearPagePool(page);
+ if (!free_pages_prepare(page, 0, false)) {
+ SetPagePool(page);
+ goto out;
+ }
+ list_add(&page->lru, &smpool->head_page);
+ smpool->free_pages++;
+ smpool->used_pages--;
+ if (smpool->free_pages > MAX_SMPOOL_PAGE) {
+ spin_lock(&hpool->lock);
+ move_pages_from_smpool_to_hpool(hpool, smpool);
+ spin_unlock(&hpool->lock);
+ }
+out:
+ spin_unlock_irqrestore(&smpool->lock, flags);
+ dhugetlb_pool_put(hpool);
+}
+#endif /* CONFIG_DYNAMIC_HUGETLB */
+
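free_page_to_dhugetlb_pool() above returns a 4K page to a per-CPU small pool and, once that pool grows past MAX_SMPOOL_PAGE, moves a batch back to the shared hpool under hpool->lock. A minimal single-threaded sketch of that per-CPU-cache-with-spill shape (names, thresholds and batch size are illustrative, not the kernel's):

#include <stdio.h>

#define SMPOOL_LIMIT	16	/* illustrative spill threshold */
#define SPILL_BATCH	8

struct small_pool { unsigned long free_pages; };
struct big_pool   { unsigned long free_pages; };

/* Return one page to the small pool; spill a batch when it gets too full. */
static void free_to_small_pool(struct small_pool *sm, struct big_pool *big)
{
	sm->free_pages++;
	if (sm->free_pages > SMPOOL_LIMIT) {
		/* In the kernel this step runs under hpool->lock. */
		sm->free_pages -= SPILL_BATCH;
		big->free_pages += SPILL_BATCH;
	}
}

int main(void)
{
	struct small_pool sm = { 0 };
	struct big_pool big = { 0 };
	int i;

	for (i = 0; i < 40; i++)
		free_to_small_pool(&sm, &big);
	printf("small %lu, big %lu\n", sm.free_pages, big.free_pages);
	return 0;
}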
/**
* mem_cgroup_iter - iterate over memory cgroup hierarchy
* @root: hierarchy root
@@ -3118,6 +3153,31 @@ static int mem_cgroup_force_empty(struct mem_cgroup *memcg)
return 0;
}
+#ifdef CONFIG_DYNAMIC_HUGETLB
+int dhugetlb_pool_force_empty(struct mem_cgroup *memcg)
+{
+ lru_add_drain_all();
+
+ drain_all_stock(memcg);
+
+ while (page_counter_read(&memcg->memory)) {
+ int progress;
+
+ if (signal_pending(current))
+ return -EINTR;
+
+ progress = try_to_free_mem_cgroup_pages(memcg, 1,
+ GFP_HIGHUSER_MOVABLE,
+ false);
+
+ if (!progress) {
+ congestion_wait(BLK_RW_ASYNC, HZ/10);
+ break;
+ }
+ }
+ return 0;
+}
+#endif
static ssize_t mem_cgroup_force_empty_write(struct kernfs_open_file *of,
char *buf, size_t nbytes,
@@ -4652,6 +4712,305 @@ static ssize_t memcg_write_event_control(struct kernfs_open_file *of,
return ret;
}
+#ifdef CONFIG_DYNAMIC_HUGETLB
+struct dhugetlb_pool *get_dhugetlb_pool_from_memcg(struct mem_cgroup *memcg)
+{
+ struct mem_cgroup_extension *memcg_ext;
+
+ if (!memcg)
+ return NULL;
+
+ memcg_ext = container_of(memcg, struct mem_cgroup_extension, memcg);
+ if (dhugetlb_pool_get(memcg_ext->hpool))
+ return memcg_ext->hpool;
+ return NULL;
+}
+
+static void set_dhugetlb_pool_to_memcg(struct mem_cgroup *memcg,
+ struct dhugetlb_pool *hpool)
+{
+ struct mem_cgroup_extension *memcg_ext;
+
+ memcg_ext = container_of(memcg, struct mem_cgroup_extension, memcg);
+
+ memcg_ext->hpool = hpool;
+}
+
+static bool should_allocate_from_dhugetlb_pool(gfp_t gfp_mask)
+{
+ gfp_t gfp = gfp_mask & GFP_HIGHUSER_MOVABLE;
+
+ if (current->flags & PF_KTHREAD)
+ return false;
+
+	/*
+	 * The cgroup only charges anonymous and file pages from userspace.
+	 * Some filesystems may have masked out __GFP_IO | __GFP_FS
+	 * to avoid recursive memory requests, e.g. loop device, xfs.
+	 */
+ if ((gfp | __GFP_IO | __GFP_FS) != GFP_HIGHUSER_MOVABLE)
+ return false;
+
+ return true;
+}
+
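should_allocate_from_dhugetlb_pool() only routes userspace movable allocations to the pool, while tolerating callers that cleared __GFP_IO/__GFP_FS. A standalone sketch of that predicate with made-up flag bits (the real bit values and the composition of GFP_HIGHUSER_MOVABLE live in the kernel's gfp.h; these definitions are assumptions for illustration, and the PF_KTHREAD check is omitted):

#include <stdio.h>
#include <stdbool.h>

/* Illustrative bit values only; not the kernel's gfp.h definitions. */
#define __GFP_RECLAIM	0x01u
#define __GFP_IO	0x02u
#define __GFP_FS	0x04u
#define __GFP_HARDWALL	0x08u
#define __GFP_HIGHMEM	0x10u
#define __GFP_MOVABLE	0x20u

#define GFP_HIGHUSER_MOVABLE (__GFP_RECLAIM | __GFP_IO | __GFP_FS | \
			      __GFP_HARDWALL | __GFP_HIGHMEM | __GFP_MOVABLE)
#define GFP_KERNEL	     (__GFP_RECLAIM | __GFP_IO | __GFP_FS)

/*
 * Accept the mask iff it has every GFP_HIGHUSER_MOVABLE bit, except that
 * __GFP_IO and __GFP_FS may be missing (GFP_NOFS/GFP_NOIO style callers).
 */
static bool should_use_pool(unsigned int gfp_mask)
{
	unsigned int gfp = gfp_mask & GFP_HIGHUSER_MOVABLE;

	return (gfp | __GFP_IO | __GFP_FS) == GFP_HIGHUSER_MOVABLE;
}

int main(void)
{
	printf("GFP_HIGHUSER_MOVABLE             -> %d\n",
	       should_use_pool(GFP_HIGHUSER_MOVABLE));
	printf("GFP_HIGHUSER_MOVABLE & ~__GFP_FS -> %d\n",
	       should_use_pool(GFP_HIGHUSER_MOVABLE & ~__GFP_FS));
	printf("GFP_KERNEL                       -> %d\n",
	       should_use_pool(GFP_KERNEL));
	return 0;
}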
+static struct page *__alloc_page_from_dhugetlb_pool(void)
+{
+ bool ret;
+ struct dhugetlb_pool *hpool;
+ struct small_page_pool *smpool;
+ struct page *page = NULL;
+ unsigned long flags;
+
+ hpool = get_dhugetlb_pool_from_task(current);
+ if (unlikely(!hpool))
+ goto out;
+
+ smpool = &hpool->smpool[smp_processor_id()];
+ spin_lock_irqsave(&smpool->lock, flags);
+
+ if (smpool->free_pages == 0) {
+ spin_lock(&hpool->lock);
+ ret = move_pages_from_hpool_to_smpool(hpool, smpool);
+ spin_unlock(&hpool->lock);
+ if (!ret)
+ goto unlock;
+ }
+
+ page = list_entry(smpool->head_page.next, struct page, lru);
+ list_del(&page->lru);
+ smpool->free_pages--;
+ smpool->used_pages++;
+ check_new_page(page);
+ SetPagePool(page);
+unlock:
+ spin_unlock_irqrestore(&smpool->lock, flags);
+out:
+ dhugetlb_pool_put(hpool);
+ return page;
+}
+
+struct page *alloc_page_from_dhugetlb_pool(gfp_t gfp_mask)
+{
+ struct page *page = NULL;
+
+ if (should_allocate_from_dhugetlb_pool(gfp_mask))
+ page = __alloc_page_from_dhugetlb_pool();
+
+ return page;
+}
+
+static void assign_new_dhugetlb_pool(struct mem_cgroup *memcg,
+ unsigned long nid)
+{
+ struct dhugetlb_pool *hpool;
+
+ hpool = hpool_alloc(nid);
+ if (!hpool)
+ return;
+
+ hpool->attach_memcg = memcg;
+ css_get(&memcg->css);
+ set_dhugetlb_pool_to_memcg(memcg, hpool);
+}
+
+static int update_dhugetlb_pool(struct mem_cgroup *memcg,
+ unsigned long nid, unsigned long size)
+{
+ int ret;
+ struct dhugetlb_pool *hpool = get_dhugetlb_pool_from_memcg(memcg);
+
+ if (!hpool) {
+ if (memcg_has_children(memcg))
+ return -EINVAL;
+ assign_new_dhugetlb_pool(memcg, nid);
+ hpool = get_dhugetlb_pool_from_memcg(memcg);
+ }
+ if (!hpool)
+ return -ENOMEM;
+ if (hpool->attach_memcg != memcg || hpool->nid != nid) {
+ dhugetlb_pool_put(hpool);
+ return -EINVAL;
+ }
+
+ ret = alloc_hugepage_from_hugetlb(hpool, nid, size);
+
+ dhugetlb_pool_put(hpool);
+ return ret;
+}
+
+/*
+ * Test whether a process can allocate the specified amount of memory.
+ *
+ * Input must be in the format '<nid> <size>'.
+ * size is interpreted as the number of 1G huge pages.
+ */
+static ssize_t memcg_write_dhugetlb(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off)
+{
+ int ret;
+ unsigned long nid, size;
+ char *endp;
+ struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
+
+ if (!dhugetlb_enabled)
+ return -EINVAL;
+
+ buf = strstrip(buf);
+ nid = memparse(buf, &endp);
+ if (*endp != ' ' || nid >= MAX_NUMNODES)
+ return -EINVAL;
+
+ buf = endp + 1;
+ size = memparse(buf, &endp);
+ if (*endp != '\0' || size == 0)
+ return -EINVAL;
+
+ ret = update_dhugetlb_pool(memcg, nid, size);
+
+ return ret ?: nbytes;
+}
+
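A hedged userspace example of feeding the '<nid> <size>' format to this handler through the dhugetlb.nr_pages file added later in this patch; the cgroup mount point and group name are placeholders, not part of the patch:

#include <stdio.h>
#include <string.h>
#include <fcntl.h>
#include <unistd.h>

int main(void)
{
	/* Path is an assumption: memory cgroup v1 mounted at the usual place. */
	const char *path =
		"/sys/fs/cgroup/memory/mygroup/dhugetlb.nr_pages";
	const char *req = "0 4\n";	/* node 0, four 1G huge pages */
	int fd = open(path, O_WRONLY);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	if (write(fd, req, strlen(req)) < 0)
		perror("write");
	close(fd);
	return 0;
}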
+static int memcg_read_dhugetlb(struct seq_file *m, void *v)
+{
+ int i;
+ unsigned long free_pages;
+ long used_pages = 0;
+ struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
+ struct dhugetlb_pool *hpool = get_dhugetlb_pool_from_memcg(memcg);
+
+ if (!dhugetlb_enabled)
+ return 0;
+ if (!hpool) {
+		seq_printf(m, "Current hierarchy has no memory pool.\n");
+ return 0;
+ }
+
+ for (i = 0; i < NR_SMPOOL; i++)
+ spin_lock(&hpool->smpool[i].lock);
+ spin_lock(&hpool->lock);
+
+ free_pages = hpool->free_pages;
+ for (i = 0; i < NR_SMPOOL; i++) {
+ free_pages += hpool->smpool[i].free_pages;
+ used_pages += hpool->smpool[i].used_pages;
+ }
+
+ seq_printf(m, "dhugetlb_total_pages %ld\n"
+ "1G_total_reserved_pages %ld\n"
+ "1G_free_reserved_pages %ld\n"
+ "1G_mmap_reserved_pages %ld\n"
+ "1G_used_pages %ld\n"
+ "1G_free_unreserved_pages %ld\n"
+ "2M_total_reserved_pages %ld\n"
+ "2M_free_reserved_pages %ld\n"
+ "2M_mmap_reserved_pages %ld\n"
+ "2M_used_pages %ld\n"
+ "2M_free_unreserved_pages %ld\n"
+ "4K_free_pages %ld\n"
+ "4K_used_pages %ld\n",
+ hpool->total_nr_pages,
+ hpool->total_reserved_1G,
+ hpool->free_reserved_1G,
+ hpool->mmap_reserved_1G,
+ hpool->used_1G,
+ hpool->free_unreserved_1G,
+ hpool->total_reserved_2M,
+ hpool->free_reserved_2M,
+ hpool->mmap_reserved_2M,
+ hpool->used_2M,
+ hpool->free_unreserved_2M,
+ free_pages,
+ used_pages);
+
+ spin_unlock(&hpool->lock);
+ for (i = NR_SMPOOL - 1; i >= 0; i--)
+ spin_unlock(&hpool->smpool[i].lock);
+ dhugetlb_pool_put(hpool);
+ return 0;
+}
+
+static int update_reserve_pages(struct kernfs_open_file *of,
+ char *buf, bool gigantic)
+{
+ unsigned long size;
+ char *endp;
+ struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
+ struct dhugetlb_pool *hpool;
+
+ if (!dhugetlb_enabled)
+ return -EINVAL;
+
+ buf = strstrip(buf);
+ size = memparse(buf, &endp);
+ if (*endp != '\0')
+ return -EINVAL;
+
+ hpool = get_dhugetlb_pool_from_memcg(memcg);
+ if (!hpool)
+ return -EINVAL;
+ spin_lock(&hpool->reserved_lock);
+ dhugetlb_reserve_hugepages(hpool, size, gigantic);
+ spin_unlock(&hpool->reserved_lock);
+ dhugetlb_pool_put(hpool);
+ return 0;
+}
+
+static ssize_t dhugetlb_1G_reserve_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off)
+{
+ return update_reserve_pages(of, buf, true) ?: nbytes;
+}
+
+static ssize_t dhugetlb_2M_reserve_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off)
+{
+ return update_reserve_pages(of, buf, false) ?: nbytes;
+}
+
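Similarly, a hedged example of setting the per-memcg reservation through the dhugetlb.1G.reserved_pages and dhugetlb.2M.reserved_pages files registered below; the cgroup path is again a placeholder, and the value written is a huge page count:

#include <stdio.h>
#include <string.h>
#include <fcntl.h>
#include <unistd.h>

/* Write a page count to one of the (assumed) per-memcg reserve files. */
static int write_count(const char *path, const char *count)
{
	int fd = open(path, O_WRONLY);

	if (fd < 0)
		return -1;
	if (write(fd, count, strlen(count)) < 0) {
		close(fd);
		return -1;
	}
	return close(fd);
}

int main(void)
{
	const char *base = "/sys/fs/cgroup/memory/mygroup/";
	char path[256];

	snprintf(path, sizeof(path), "%sdhugetlb.1G.reserved_pages", base);
	write_count(path, "2");		/* reserve two 1G pages */
	snprintf(path, sizeof(path), "%sdhugetlb.2M.reserved_pages", base);
	write_count(path, "64");	/* reserve sixty-four 2M pages */
	return 0;
}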
+static void dhugetlb_pool_inherits(struct mem_cgroup *memcg,
+ struct mem_cgroup *parent)
+{
+ struct dhugetlb_pool *hpool;
+
+ hpool = get_dhugetlb_pool_from_memcg(parent);
+ if (!hpool)
+ return;
+
+ set_dhugetlb_pool_to_memcg(memcg, hpool);
+ dhugetlb_pool_put(hpool);
+}
+
+static bool dhugetlb_pool_free(struct mem_cgroup *memcg)
+{
+ bool ret = true;
+ struct dhugetlb_pool *hpool;
+
+ hpool = get_dhugetlb_pool_from_memcg(memcg);
+ if (hpool && hpool->attach_memcg == memcg)
+ ret = free_dhugetlb_pool(hpool);
+ dhugetlb_pool_put(hpool);
+ return ret;
+}
+
+bool dhugetlb_pool_is_free(struct cgroup_subsys_state *css)
+{
+ if (dhugetlb_enabled)
+ return dhugetlb_pool_free(mem_cgroup_from_css(css));
+ return true;
+}
+#else
+static void dhugetlb_pool_inherits(struct mem_cgroup *memcg,
+ struct mem_cgroup *parent)
+{
+}
+
+bool dhugetlb_pool_is_free(struct cgroup_subsys_state *css)
+{
+ return true;
+}
+#endif /* CONFIG_DYNAMIC_HUGETLB */
+
static struct cftype mem_cgroup_legacy_files[] = {
{
.name = "usage_in_bytes",
@@ -4700,6 +5059,27 @@ static struct cftype mem_cgroup_legacy_files[] = {
.write = memcg_write_event_control,
.flags = CFTYPE_NO_PREFIX | CFTYPE_WORLD_WRITABLE,
},
+#ifdef CONFIG_DYNAMIC_HUGETLB
+ {
+ .name = "dhugetlb.nr_pages",
+ .write = memcg_write_dhugetlb,
+ .seq_show = memcg_read_dhugetlb,
+ .flags = CFTYPE_NO_PREFIX | CFTYPE_WORLD_WRITABLE |
+ CFTYPE_NOT_ON_ROOT,
+ },
+ {
+ .name = "dhugetlb.1G.reserved_pages",
+ .write = dhugetlb_1G_reserve_write,
+ .flags = CFTYPE_NO_PREFIX | CFTYPE_WORLD_WRITABLE |
+ CFTYPE_NOT_ON_ROOT,
+ },
+ {
+ .name = "dhugetlb.2M.reserved_pages",
+ .write = dhugetlb_2M_reserve_write,
+ .flags = CFTYPE_NO_PREFIX | CFTYPE_WORLD_WRITABLE |
+ CFTYPE_NOT_ON_ROOT,
+ },
+#endif
{
.name = "swappiness",
.read_u64 = mem_cgroup_swappiness_read,
@@ -5063,6 +5443,9 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
return &memcg->css;
}
+ if (dhugetlb_enabled)
+ dhugetlb_pool_inherits(memcg, parent);
+
error = memcg_online_kmem(memcg);
if (error)
goto fail;
@@ -5681,6 +6064,14 @@ static int mem_cgroup_can_attach(struct cgroup_taskset *tset)
if (!p)
return 0;
+ if (dhugetlb_enabled) {
+ struct dhugetlb_pool *hpool = get_dhugetlb_pool_from_task(p);
+
+ if (hpool) {
+ dhugetlb_pool_put(hpool);
+ return -EPERM;
+ }
+ }
/*
* We are now commited to this value whatever it is. Changes in this
* tunable will only affect upcoming migrations, not the current one.
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index a6a2f254f61f..e722d73a3724 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1052,7 +1052,7 @@ static int free_tail_pages_check(struct page *head_page, struct page *page)
return ret;
}
-static __always_inline bool free_pages_prepare(struct page *page,
+__always_inline bool free_pages_prepare(struct page *page,
unsigned int order, bool check_free)
{
int bad = 0;
@@ -2012,7 +2012,7 @@ static void check_new_page_bad(struct page *page)
/*
* This page is about to be returned from the page allocator
*/
-static inline int check_new_page(struct page *page)
+inline int check_new_page(struct page *page)
{
if (likely(page_expected_state(page,
PAGE_FLAGS_CHECK_AT_PREP|__PG_HWPOISON)))
@@ -2075,8 +2075,8 @@ inline void post_alloc_hook(struct page *page, unsigned int order,
set_page_owner(page, order, gfp_flags);
}
-static void prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags,
- unsigned int alloc_flags)
+void prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags,
+ unsigned int alloc_flags)
{
int i;
@@ -2955,6 +2955,12 @@ void free_unref_page(struct page *page)
unsigned long flags;
unsigned long pfn = page_to_pfn(page);
+ /* Free dynamic hugetlb page */
+ if (dhugetlb_enabled && PagePool(page)) {
+ free_page_to_dhugetlb_pool(page);
+ return;
+ }
+
if (!free_unref_page_prepare(page, pfn))
return;
@@ -2972,6 +2978,16 @@ void free_unref_page_list(struct list_head *list)
unsigned long flags, pfn;
int batch_count = 0;
+ /* Free dynamic hugetlb pages */
+ if (dhugetlb_enabled) {
+ list_for_each_entry_safe(page, next, list, lru) {
+ if (PagePool(page)) {
+ list_del(&page->lru);
+ free_page_to_dhugetlb_pool(page);
+ }
+ }
+ }
+
/* Prepare pages for freeing */
list_for_each_entry_safe(page, next, list, lru) {
pfn = page_to_pfn(page);
@@ -4785,6 +4801,15 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, int preferred_nid,
finalise_ac(gfp_mask, &ac);
+	/* Dynamic hugetlb allocation attempt */
+ if (dhugetlb_enabled && likely(order == 0)) {
+ page = alloc_page_from_dhugetlb_pool(gfp_mask);
+ if (page) {
+ prep_new_page(page, order, gfp_mask, alloc_flags);
+ goto out;
+ }
+ }
+
/* First allocation attempt */
page = get_page_from_freelist(alloc_mask, order, alloc_flags, &ac);
if (likely(page))
--
2.25.1