hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I4K3S5
--------------------------
Generalize naming of some kabi helper macros.
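For illustration only (not part of this patch): a consolidated sketch of how the renamed helpers fit together for a dynamically allocated base struct, using the hypothetical 'struct foo' from the kabi.h comments.

  struct foo_resvd {
          int newly_added;
  };

  struct foo {
          bool big_hammer;
          KABI_AUX_PTR(foo)       /* adds foo_size_resvd and _resvd */
  };

  static struct foo *alloc_foo(void)
  {
          struct foo *f = kzalloc(sizeof(*f), GFP_KERNEL);

          if (!f)
                  return NULL;
          f->_resvd = kzalloc(sizeof(*f->_resvd), GFP_KERNEL);
          if (!f->_resvd) {
                  kfree(f);
                  return NULL;
          }
          KABI_AUX_SET_SIZE(f, foo);      /* MUST run at the allocation site */
          return f;
  }

  static void use(struct foo *f)
  {
          if (KABI_AUX(f, foo, newly_added))      /* guard every aux access */
                  f->_resvd->newly_added = 123;
          else
                  f->big_hammer = true;           /* old fallback behavior */
  }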
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Reviewed-by: Xie XiuQi <xiexiuqi@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 include/linux/kabi.h | 38 +++++++++++++++++++-------------------
 1 file changed, 19 insertions(+), 19 deletions(-)
diff --git a/include/linux/kabi.h b/include/linux/kabi.h index 713f63cf56cb..0bc7ca2483f4 100644 --- a/include/linux/kabi.h +++ b/include/linux/kabi.h @@ -151,7 +151,7 @@ * approaches can (and often are) combined. * * To use this for 'struct foo' (the "base structure"), define a new - * structure called 'struct foo_rh'; this new struct is called "auxiliary + * structure called 'struct foo_resvd'; this new struct is called "auxiliary * structure". Then add KABI_AUX_EMBED or KABI_AUX_PTR to the end * of the base structure. The argument is the name of the base structure, * without the 'struct' keyword. @@ -174,7 +174,7 @@ * end. Note the auxiliary structure cannot be shrunk in size later (i.e., * fields cannot be removed, only deprecated). Any code accessing fields * from the aux struct must guard the access using the KABI_AUX macro. - * The access itself is then done via a '_rh' field in the base struct. + * The access itself is then done via a '_resvd' field in the base struct. * * The auxiliary structure is not guaranteed for access by modules unless * explicitly commented as such in the declaration of the aux struct @@ -182,7 +182,7 @@ * * Example: * - * struct foo_rh { + * struct foo_resvd { * int newly_added; * }; * @@ -194,20 +194,20 @@ * void use(struct foo *f) * { * if (KABI_AUX(f, foo, newly_added)) - * f->_rh->newly_added = 123; + * f->_resvd->newly_added = 123; * else * // the field 'newly_added' is not present in the passed * // struct, fall back to old behavior * f->big_hammer = true; * } * - * static struct foo_rh my_foo_rh { + * static struct foo_resvd my_foo_resvd { * .newly_added = 0; * } * * static struct foo my_foo = { * .big_hammer = false, - * ._rh = &my_foo_rh, + * ._resvd = &my_foo_resvd, * KABI_AUX_INIT_SIZE(foo) * }; * @@ -218,7 +218,7 @@ * * Example: * - * struct foo_rh { + * struct foo_resvd { * }; * * struct foo { @@ -385,7 +385,7 @@ _Static_assert(__alignof__(struct{_new;}) <= __alignof__(struct{_orig;}), \ __FILE__ ":" __stringify(__LINE__) ": " __stringify(_orig) " is not aligned the same as " __stringify(_new) KABI_ALIGN_WARNING); \ } -# define __ABI_CHECK_SIZE(_item, _size) \ +# define __KABI_CHECK_SIZE(_item, _size) \ _Static_assert(sizeof(struct{_item;}) <= _size, \ __FILE__ ":" __stringify(__LINE__) ": " __stringify(_item) " is larger than the reserved size (" __stringify(_size) " bytes)" KABI_ALIGN_WARNING) #else @@ -451,14 +451,14 @@ })
#define _KABI_AUX_PTR(_struct) \ - size_t _struct##_size_rh; \ - _KABI_EXCLUDE(struct _struct##_rh *_rh) + size_t _struct##_size_resvd; \ + _KABI_EXCLUDE(struct _struct##_resvd *_resvd) #define KABI_AUX_PTR(_struct) \ _KABI_AUX_PTR(_struct);
#define _KABI_AUX_EMBED(_struct) \ - size_t _struct##_size_rh; \ - _KABI_EXCLUDE(struct _struct##_rh _rh) + size_t _struct##_size_resvd; \ + _KABI_EXCLUDE(struct _struct##_resvd _resvd) #define KABI_AUX_EMBED(_struct) \ _KABI_AUX_EMBED(_struct);
@@ -468,7 +468,7 @@
/* * KABI_AUX_SET_SIZE calculates and sets the size of the extended struct and - * stores it in the size_rh field for structs that are dynamically allocated. + * stores it in the size_resvd field for structs that are dynamically allocated. * This macro MUST be called when expanding a base struct with * KABI_SIZE_AND_EXTEND, and it MUST be called from the allocation site * regardless of being allocated in the kernel or a module. @@ -476,28 +476,28 @@ * a semicolon is necessary at the end of the line where it is invoked. */ #define KABI_AUX_SET_SIZE(_name, _struct) ({ \ - (_name)->_struct##_size_rh = sizeof(struct _struct##_rh); \ + (_name)->_struct##_size_resvd = sizeof(struct _struct##_resvd); \ })
/* * KABI_AUX_INIT_SIZE calculates and sets the size of the extended struct and - * stores it in the size_rh field for structs that are statically allocated. + * stores it in the size_resvd field for structs that are statically allocated. * This macro MUST be called when expanding a base struct with * KABI_SIZE_AND_EXTEND, and it MUST be called from the declaration site * regardless of being allocated in the kernel or a module. */ #define KABI_AUX_INIT_SIZE(_struct) \ - ._struct##_size_rh = sizeof(struct _struct##_rh), + ._struct##_size_resvd = sizeof(struct _struct##_resvd),
/* * KABI_AUX verifies allocated memory exists. This MUST be called to - * verify that memory in the _rh struct is valid, and can be called + * verify that memory in the _resvd struct is valid, and can be called * regardless if KABI_SIZE_AND_EXTEND or KABI_SIZE_AND_EXTEND_PTR is * used. */ #define KABI_AUX(_ptr, _struct, _field) ({ \ - size_t __off = offsetof(struct _struct##_rh, _field); \ - (_ptr)->_struct##_size_rh > __off ? true : false; \ + size_t __off = offsetof(struct _struct##_resvd, _field); \ + (_ptr)->_struct##_size_resvd > __off ? true : false; \ })
#endif /* _LINUX_KABI_H */
hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I4JBL0
CVE: NA
------------------------------
Add KABI_AUX_PTR extensions to the following base structures before KABI freeze (a usage sketch follows the list):
struct device_driver
struct class
struct device
struct hrtimer
struct ipmi_smi_handlers
struct net_device
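For illustration (the member is hypothetical, not in this patch): once the KABI freeze is in effect, a new field goes into the aux struct and is reached through the guarded _resvd pointer, e.g. for struct device:

  struct device_extended_resvd {
          void *vendor_data;      /* hypothetical future member */
  };

  static void dev_use_ext(struct device *dev, void *data)
  {
          /* true only on kernels whose allocation covers the member */
          if (KABI_AUX(dev, device_extended, vendor_data))
                  dev->_resvd->vendor_data = data;
  }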
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Reviewed-by: Xie XiuQi <xiexiuqi@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 include/linux/device.h        |  9 ++++++++-
 include/linux/device/class.h  |  7 +++++++
 include/linux/device/driver.h |  7 +++++++
 include/linux/hrtimer.h       |  7 +++++++
 include/linux/ipmi_smi.h      | 10 ++++++++++
 include/linux/netdevice.h     |  7 +++++++
 6 files changed, 46 insertions(+), 1 deletion(-)
diff --git a/include/linux/device.h b/include/linux/device.h index 2e95153942d1..929b66dc5ea9 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -370,6 +370,12 @@ struct dev_links_info { enum dl_dev_state status; };
+/** + * struct device_extended_resvd - KABI extension struct + */ +struct device_extended_resvd { +}; + /** * struct device - The basic device structure * @parent: The device's "parent" device, the device to which it is attached. @@ -560,7 +566,7 @@ struct device { #ifdef CONFIG_DMA_OPS_BYPASS bool dma_ops_bypass : 1; #endif - + /* Use device_extended after all RESERVE fields used */ KABI_RESERVE(1) KABI_RESERVE(2) KABI_RESERVE(3) @@ -577,6 +583,7 @@ struct device { KABI_RESERVE(14) KABI_RESERVE(15) KABI_RESERVE(16) + KABI_AUX_PTR(device_extended) };
/** diff --git a/include/linux/device/class.h b/include/linux/device/class.h index d152ddce543a..f2071ee10d61 100644 --- a/include/linux/device/class.h +++ b/include/linux/device/class.h @@ -24,6 +24,12 @@ struct device; struct fwnode_handle;
+/** + * struct class_resvd - KABI extension struct + */ +struct class_resvd { +}; + /** * struct class - device classes * @name: Name of the class. @@ -82,6 +88,7 @@ struct class { KABI_RESERVE(2) KABI_RESERVE(3) KABI_RESERVE(4) + KABI_AUX_PTR(class) };
struct class_dev_iter { diff --git a/include/linux/device/driver.h b/include/linux/device/driver.h index 9473256006e5..b814f1bd79f6 100644 --- a/include/linux/device/driver.h +++ b/include/linux/device/driver.h @@ -48,6 +48,12 @@ enum probe_type { PROBE_FORCE_SYNCHRONOUS, };
+/** + * struct device_driver_resvd - KABI extension struct + */ +struct device_driver_resvd { +}; + /** * struct device_driver - The basic device driver structure * @name: Name of the device driver. @@ -124,6 +130,7 @@ struct device_driver { KABI_RESERVE(2) KABI_RESERVE(3) KABI_RESERVE(4) + KABI_AUX_PTR(device_driver) };
diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index 1525853610b0..b1f2e4692e66 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -95,6 +95,12 @@ enum hrtimer_restart { #define HRTIMER_STATE_INACTIVE 0x00 #define HRTIMER_STATE_ENQUEUED 0x01
+/** + * struct hrtimer_resvd - KABI extension struct + */ +struct hrtimer_resvd { +}; + /** * struct hrtimer - the basic hrtimer structure * @node: timerqueue node, which also manages node.expires, @@ -128,6 +134,7 @@ struct hrtimer {
KABI_RESERVE(1) KABI_RESERVE(2) + KABI_AUX_PTR(hrtimer) };
/** diff --git a/include/linux/ipmi_smi.h b/include/linux/ipmi_smi.h index deec18b8944a..99ad621b0350 100644 --- a/include/linux/ipmi_smi.h +++ b/include/linux/ipmi_smi.h @@ -70,6 +70,15 @@ struct ipmi_smi_msg { void (*done)(struct ipmi_smi_msg *msg); };
+/** + * struct ipmi_smi_handlers_resvd - KABI extension struct + * This extension must be dynamically allocated for every instance of + * ipmi_smi_handlers, because ipmi_smi_handlers is embedded in another + * struct. + */ +struct ipmi_smi_handlers_resvd { +}; + struct ipmi_smi_handlers { struct module *owner;
@@ -153,6 +162,7 @@ struct ipmi_smi_handlers { * block. */ void (*set_maintenance_mode)(void *send_info, bool enable); + KABI_AUX_PTR(ipmi_smi_handlers) };
struct ipmi_device_id { diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 1335981945d9..de1fb4ec4d50 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1639,6 +1639,12 @@ enum netdev_ml_priv_type { ML_PRIV_CAN, };
+/** + * struct net_device_extended_resvd - KABI extension struct + */ +struct net_device_extended_resvd { +}; + /** * struct net_device - The DEVICE structure. * @@ -2215,6 +2221,7 @@ struct net_device { KABI_RESERVE(6) KABI_RESERVE(7) KABI_RESERVE(8) + KABI_AUX_PTR(net_device_extended) }; #define to_net_dev(d) container_of(d, struct net_device, dev)
From: Stanislav Fomichev <sdf@google.com>
mainline inclusion
from mainline-v5.11-rc1
commit 427167c0b064ed898b848209add62b4322ec7840
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I4GII8?from=project-issue
CVE: NA
---------
I now have to lock/unlock the socket for the bind hook execution. That shouldn't cause any overhead because the socket is unbound and shouldn't receive any traffic.
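As a sketch of what this enables (not part of the patch; loading and attaching via BPF_CGROUP_INET4_BIND is omitted), a cgroup/bind4 program can now call the sockopt helpers. SO_KEEPALIVE is used here as an option the helper is expected to support at this kernel version:

  #include <sys/socket.h>
  #include <linux/bpf.h>
  #include <bpf/bpf_helpers.h>

  SEC("cgroup/bind4")
  int bind_v4_prog(struct bpf_sock_addr *ctx)
  {
          int one = 1;

          /* Safe now: the hook runs between lock_sock()/release_sock(). */
          if (bpf_setsockopt(ctx, SOL_SOCKET, SO_KEEPALIVE, &one, sizeof(one)))
                  return 0;       /* reject the bind() on helper failure */
          return 1;               /* let the bind() proceed */
  }

  char _license[] SEC("license") = "GPL";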
Signed-off-by: Stanislav Fomichev <sdf@google.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Andrey Ignatov <rdna@fb.com>
Link: https://lore.kernel.org/bpf/20201202172516.3483656-3-sdf@google.com
Signed-off-by: Lu Jialin <lujialin4@huawei.com>
Reviewed-by: Wei Yongjun <weiyongjun1@huawei.com>
Reviewed-by: weiyang wang <wangweiyang2@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 include/linux/bpf-cgroup.h | 12 ++++++------
 net/core/filter.c          |  4 ++++
 net/ipv4/af_inet.c         |  2 +-
 net/ipv6/af_inet6.c        |  2 +-
 4 files changed, 12 insertions(+), 8 deletions(-)
diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h index 7014d2ade050..2e4550514e21 100644 --- a/include/linux/bpf-cgroup.h +++ b/include/linux/bpf-cgroup.h @@ -253,11 +253,11 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key, __ret; \ })
-#define BPF_CGROUP_RUN_PROG_INET4_BIND(sk, uaddr) \ - BPF_CGROUP_RUN_SA_PROG(sk, uaddr, BPF_CGROUP_INET4_BIND) +#define BPF_CGROUP_RUN_PROG_INET4_BIND_LOCK(sk, uaddr) \ + BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, BPF_CGROUP_INET4_BIND, NULL)
-#define BPF_CGROUP_RUN_PROG_INET6_BIND(sk, uaddr) \ - BPF_CGROUP_RUN_SA_PROG(sk, uaddr, BPF_CGROUP_INET6_BIND) +#define BPF_CGROUP_RUN_PROG_INET6_BIND_LOCK(sk, uaddr) \ + BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, BPF_CGROUP_INET6_BIND, NULL)
#define BPF_CGROUP_PRE_CONNECT_ENABLED(sk) (cgroup_bpf_enabled && \ sk->sk_prot->pre_connect) @@ -439,8 +439,8 @@ static inline int bpf_percpu_cgroup_storage_update(struct bpf_map *map, #define BPF_CGROUP_RUN_PROG_INET_EGRESS(sk,skb) ({ 0; }) #define BPF_CGROUP_RUN_PROG_INET_SOCK(sk) ({ 0; }) #define BPF_CGROUP_RUN_PROG_INET_SOCK_RELEASE(sk) ({ 0; }) -#define BPF_CGROUP_RUN_PROG_INET4_BIND(sk, uaddr) ({ 0; }) -#define BPF_CGROUP_RUN_PROG_INET6_BIND(sk, uaddr) ({ 0; }) +#define BPF_CGROUP_RUN_PROG_INET4_BIND_LOCK(sk, uaddr) ({ 0; }) +#define BPF_CGROUP_RUN_PROG_INET6_BIND_LOCK(sk, uaddr) ({ 0; }) #define BPF_CGROUP_RUN_PROG_INET4_POST_BIND(sk) ({ 0; }) #define BPF_CGROUP_RUN_PROG_INET6_POST_BIND(sk) ({ 0; }) #define BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr) ({ 0; }) diff --git a/net/core/filter.c b/net/core/filter.c index abd58dce49bb..42d15942ce90 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -6995,6 +6995,8 @@ sock_addr_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_sk_storage_delete_proto; case BPF_FUNC_setsockopt: switch (prog->expected_attach_type) { + case BPF_CGROUP_INET4_BIND: + case BPF_CGROUP_INET6_BIND: case BPF_CGROUP_INET4_CONNECT: case BPF_CGROUP_INET6_CONNECT: return &bpf_sock_addr_setsockopt_proto; @@ -7003,6 +7005,8 @@ sock_addr_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) } case BPF_FUNC_getsockopt: switch (prog->expected_attach_type) { + case BPF_CGROUP_INET4_BIND: + case BPF_CGROUP_INET6_BIND: case BPF_CGROUP_INET4_CONNECT: case BPF_CGROUP_INET6_CONNECT: return &bpf_sock_addr_getsockopt_proto; diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 8267349afe23..fdc0ac586489 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -450,7 +450,7 @@ int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) /* BPF prog is run before any checks are done so that if the prog * changes context in a wrong way it will be caught. */ - err = BPF_CGROUP_RUN_PROG_INET4_BIND(sk, uaddr); + err = BPF_CGROUP_RUN_PROG_INET4_BIND_LOCK(sk, uaddr); if (err) return err;
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index e648fbebb167..a7e3d170af51 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -451,7 +451,7 @@ int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) /* BPF prog is run before any checks are done so that if the prog * changes context in a wrong way it will be caught. */ - err = BPF_CGROUP_RUN_PROG_INET6_BIND(sk, uaddr); + err = BPF_CGROUP_RUN_PROG_INET6_BIND_LOCK(sk, uaddr); if (err) return err;
From: Stanislav Fomichev <sdf@google.com>
mainline inclusion
from mainline-v5.12-rc1
commit 20f2505fb436cfa674cf1f46aaa624f44d3d1d03
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I4GII8?from=project-issue
CVE: NA
--------------
When we attach a bpf program to cgroup/getsockopt, any other getsockopt() syscall starts incurring kzalloc/kfree cost.

Let's add a small buffer on the stack and use it for small (majority) {s,g}etsockopt values. The buffer is small enough to fit into a cache line and covers the majority of simple options (most of them are 4-byte ints).
It seems natural to do the same for setsockopt, but it's a bit more involved when the BPF program modifies the data (where we have to kmalloc). The assumption is that for the majority of setsockopt calls (which are doing pure BPF options or apply policy) this will bring some benefit as well.
Without this patch (we remove about 1% __kmalloc):

     3.38%     0.07%  tcp_mmap  [kernel.kallsyms]  [k] __cgroup_bpf_run_filter_getsockopt
            |
             --3.30%--__cgroup_bpf_run_filter_getsockopt
                      |
                       --0.81%--__kmalloc
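The pattern in miniature (illustrative userspace C, not the kernel code): serve small requests from a caller-provided buffer, fall back to the heap for large ones, and key the free path off which case was taken:

  #include <stdlib.h>
  #include <string.h>

  struct small_buf { char data[32]; };    /* mirrors BPF_SOCKOPT_KERN_BUF_SIZE */

  static void *opt_alloc(struct small_buf *buf, size_t len)
  {
          if (len <= sizeof(buf->data))
                  return memset(buf->data, 0, len);       /* fast path, no allocation */
          return calloc(1, len);                          /* slow path */
  }

  static void opt_free(struct small_buf *buf, void *p)
  {
          if (p != buf->data)     /* only the heap case owns memory */
                  free(p);
  }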
Signed-off-by: Stanislav Fomichev <sdf@google.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Martin KaFai Lau <kafai@fb.com>
Link: https://lore.kernel.org/bpf/20210115163501.805133-3-sdf@google.com
Signed-off-by: Lu Jialin <lujialin4@huawei.com>
Reviewed-by: Wei Yongjun <weiyongjun1@huawei.com>
Reviewed-by: weiyang wang <wangweiyang2@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 include/linux/filter.h |  5 ++++
 kernel/bpf/cgroup.c    | 52 ++++++++++++++++++++++++++++++++++++------
 2 files changed, 50 insertions(+), 7 deletions(-)
diff --git a/include/linux/filter.h b/include/linux/filter.h index ca1f09aa9034..45ba2a61f9e5 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -1297,6 +1297,11 @@ struct bpf_sysctl_kern { u64 tmp_reg; };
+#define BPF_SOCKOPT_KERN_BUF_SIZE 32 +struct bpf_sockopt_buf { + u8 data[BPF_SOCKOPT_KERN_BUF_SIZE]; +}; + struct bpf_sockopt_kern { struct sock *sk; u8 *optval; diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 6aa9e10c6335..ae270d6f97f9 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -1298,7 +1298,8 @@ static bool __cgroup_bpf_prog_array_is_empty(struct cgroup *cgrp, return empty; }
-static int sockopt_alloc_buf(struct bpf_sockopt_kern *ctx, int max_optlen) +static int sockopt_alloc_buf(struct bpf_sockopt_kern *ctx, int max_optlen, + struct bpf_sockopt_buf *buf) { if (unlikely(max_optlen < 0)) return -EINVAL; @@ -1310,6 +1311,15 @@ static int sockopt_alloc_buf(struct bpf_sockopt_kern *ctx, int max_optlen) max_optlen = PAGE_SIZE; }
+ if (max_optlen <= sizeof(buf->data)) { + /* When the optval fits into BPF_SOCKOPT_KERN_BUF_SIZE + * bytes avoid the cost of kzalloc. + */ + ctx->optval = buf->data; + ctx->optval_end = ctx->optval + max_optlen; + return max_optlen; + } + ctx->optval = kzalloc(max_optlen, GFP_USER); if (!ctx->optval) return -ENOMEM; @@ -1319,16 +1329,26 @@ static int sockopt_alloc_buf(struct bpf_sockopt_kern *ctx, int max_optlen) return max_optlen; }
-static void sockopt_free_buf(struct bpf_sockopt_kern *ctx) +static void sockopt_free_buf(struct bpf_sockopt_kern *ctx, + struct bpf_sockopt_buf *buf) { + if (ctx->optval == buf->data) + return; kfree(ctx->optval); }
+static bool sockopt_buf_allocated(struct bpf_sockopt_kern *ctx, + struct bpf_sockopt_buf *buf) +{ + return ctx->optval != buf->data; +} + int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level, int *optname, char __user *optval, int *optlen, char **kernel_optval) { struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); + struct bpf_sockopt_buf buf = {}; struct bpf_sockopt_kern ctx = { .sk = sk, .level = *level, @@ -1350,7 +1370,7 @@ int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level, */ max_optlen = max_t(int, 16, *optlen);
- max_optlen = sockopt_alloc_buf(&ctx, max_optlen); + max_optlen = sockopt_alloc_buf(&ctx, max_optlen, &buf); if (max_optlen < 0) return max_optlen;
@@ -1390,14 +1410,31 @@ int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level, */ if (ctx.optlen != 0) { *optlen = ctx.optlen; - *kernel_optval = ctx.optval; + /* We've used bpf_sockopt_kern->buf as an intermediary + * storage, but the BPF program indicates that we need + * to pass this data to the kernel setsockopt handler. + * No way to export on-stack buf, have to allocate a + * new buffer. + */ + if (!sockopt_buf_allocated(&ctx, &buf)) { + void *p = kmalloc(ctx.optlen, GFP_USER); + + if (!p) { + ret = -ENOMEM; + goto out; + } + memcpy(p, ctx.optval, ctx.optlen); + *kernel_optval = p; + } else { + *kernel_optval = ctx.optval; + } /* export and don't free sockopt buf */ return 0; } }
out: - sockopt_free_buf(&ctx); + sockopt_free_buf(&ctx, &buf); return ret; }
@@ -1407,6 +1444,7 @@ int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level, int retval) { struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); + struct bpf_sockopt_buf buf = {}; struct bpf_sockopt_kern ctx = { .sk = sk, .level = level, @@ -1425,7 +1463,7 @@ int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level,
ctx.optlen = max_optlen;
- max_optlen = sockopt_alloc_buf(&ctx, max_optlen); + max_optlen = sockopt_alloc_buf(&ctx, max_optlen, &buf); if (max_optlen < 0) return max_optlen;
@@ -1488,7 +1526,7 @@ int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level, ret = ctx.retval;
out: - sockopt_free_buf(&ctx); + sockopt_free_buf(&ctx, &buf); return ret; } #endif
From: Stanislav Fomichev <sdf@google.com>
mainline inclusion
from mainline-v5.12-rc1
commit a9ed15dae0755a0368735e0556a462d8519bdb05
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I4GII8?from=project-issue
CVE: NA
------------
When we attach any cgroup hook, the rest (even if unused/unattached) start to contribute a small overhead. In particular, the one we want to avoid is __cgroup_bpf_run_filter_skb, which does two redirections to get to the cgroup and pushes/pulls the skb.

Let's split cgroup_bpf_enabled to be per-attach to make sure only used attach types trigger.

I've dropped some existing high-level cgroup_bpf_enabled checks in some places because the BPF_PROG_CGROUP_XXX_RUN macros usually have another cgroup_bpf_enabled check.

I also had to copy-paste BPF_CGROUP_RUN_SA_PROG_LOCK for GETPEERNAME/GETSOCKNAME because the type for cgroup_bpf_enabled[type] has to be constant and known at compile time.
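Condensed from the diff below, the mechanism is one static branch per attach type, bumped on attach and dropped on detach, so an unattached type costs only a patched-out no-op on the fast path:

  DEFINE_STATIC_KEY_ARRAY_FALSE(cgroup_bpf_enabled_key, MAX_BPF_ATTACH_TYPE);

  #define cgroup_bpf_enabled(type) \
          static_branch_unlikely(&cgroup_bpf_enabled_key[type])

  /* attach: static_branch_inc(&cgroup_bpf_enabled_key[type]);
   * detach: static_branch_dec(&cgroup_bpf_enabled_key[type]); */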
Signed-off-by: Stanislav Fomichev <sdf@google.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Song Liu <songliubraving@fb.com>
Link: https://lore.kernel.org/bpf/20210115163501.805133-4-sdf@google.com
Conflict: include/linux/bpf-cgroup.h
Signed-off-by: Lu Jialin <lujialin4@huawei.com>
Reviewed-by: Wei Yongjun <weiyongjun1@huawei.com>
Reviewed-by: weiyang wang <wangweiyang2@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 include/linux/bpf-cgroup.h | 36 +++++++++++++++++++-----------------
 kernel/bpf/cgroup.c        | 14 ++++++--------
 net/ipv4/af_inet.c         |  9 +++++----
 net/ipv4/udp.c             |  7 +++----
 net/ipv6/af_inet6.c        |  9 +++++----
 net/ipv6/udp.c             |  7 +++----
 6 files changed, 41 insertions(+), 41 deletions(-)
diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h index 2e4550514e21..1dcd74f966c5 100644 --- a/include/linux/bpf-cgroup.h +++ b/include/linux/bpf-cgroup.h @@ -25,8 +25,8 @@ struct task_struct;
#ifdef CONFIG_CGROUP_BPF
-extern struct static_key_false cgroup_bpf_enabled_key; -#define cgroup_bpf_enabled static_branch_unlikely(&cgroup_bpf_enabled_key) +extern struct static_key_false cgroup_bpf_enabled_key[MAX_BPF_ATTACH_TYPE]; +#define cgroup_bpf_enabled(type) static_branch_unlikely(&cgroup_bpf_enabled_key[type])
#define for_each_cgroup_storage_type(stype) \ for (stype = 0; stype < MAX_BPF_CGROUP_STORAGE_TYPE; stype++) @@ -192,7 +192,7 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key, #define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk, skb) \ ({ \ int __ret = 0; \ - if (cgroup_bpf_enabled) \ + if (cgroup_bpf_enabled(BPF_CGROUP_INET_INGRESS)) \ __ret = __cgroup_bpf_run_filter_skb(sk, skb, \ BPF_CGROUP_INET_INGRESS); \ \ @@ -202,7 +202,7 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key, #define BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb) \ ({ \ int __ret = 0; \ - if (cgroup_bpf_enabled && sk && sk == skb->sk) { \ + if (cgroup_bpf_enabled(BPF_CGROUP_INET_EGRESS) && sk && sk == skb->sk) { \ typeof(sk) __sk = sk_to_full_sk(sk); \ if (sk_fullsock(__sk)) \ __ret = __cgroup_bpf_run_filter_skb(__sk, skb, \ @@ -214,7 +214,7 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key, #define BPF_CGROUP_RUN_SK_PROG(sk, type) \ ({ \ int __ret = 0; \ - if (cgroup_bpf_enabled) { \ + if (cgroup_bpf_enabled(type)) { \ __ret = __cgroup_bpf_run_filter_sk(sk, type); \ } \ __ret; \ @@ -235,7 +235,7 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key, #define BPF_CGROUP_RUN_SA_PROG(sk, uaddr, type) \ ({ \ int __ret = 0; \ - if (cgroup_bpf_enabled) \ + if (cgroup_bpf_enabled(type)) \ __ret = __cgroup_bpf_run_filter_sock_addr(sk, uaddr, type, \ NULL); \ __ret; \ @@ -244,7 +244,7 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key, #define BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, type, t_ctx) \ ({ \ int __ret = 0; \ - if (cgroup_bpf_enabled) { \ + if (cgroup_bpf_enabled(type)) { \ lock_sock(sk); \ __ret = __cgroup_bpf_run_filter_sock_addr(sk, uaddr, type, \ t_ctx); \ @@ -259,8 +259,10 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key, #define BPF_CGROUP_RUN_PROG_INET6_BIND_LOCK(sk, uaddr) \ BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, BPF_CGROUP_INET6_BIND, NULL)
-#define BPF_CGROUP_PRE_CONNECT_ENABLED(sk) (cgroup_bpf_enabled && \ - sk->sk_prot->pre_connect) +#define BPF_CGROUP_PRE_CONNECT_ENABLED(sk) \ + ((cgroup_bpf_enabled(BPF_CGROUP_INET4_CONNECT) || \ + cgroup_bpf_enabled(BPF_CGROUP_INET6_CONNECT)) && \ + (sk)->sk_prot->pre_connect)
#define BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr) \ BPF_CGROUP_RUN_SA_PROG(sk, uaddr, BPF_CGROUP_INET4_CONNECT) @@ -304,7 +306,7 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key, #define BPF_CGROUP_RUN_PROG_SOCK_OPS_SK(sock_ops, sk) \ ({ \ int __ret = 0; \ - if (cgroup_bpf_enabled) \ + if (cgroup_bpf_enabled(BPF_CGROUP_SOCK_OPS)) \ __ret = __cgroup_bpf_run_filter_sock_ops(sk, \ sock_ops, \ BPF_CGROUP_SOCK_OPS); \ @@ -314,7 +316,7 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key, #define BPF_CGROUP_RUN_PROG_SOCK_OPS(sock_ops) \ ({ \ int __ret = 0; \ - if (cgroup_bpf_enabled && (sock_ops)->sk) { \ + if (cgroup_bpf_enabled(BPF_CGROUP_SOCK_OPS) && (sock_ops)->sk) { \ typeof(sk) __sk = sk_to_full_sk((sock_ops)->sk); \ if (__sk && sk_fullsock(__sk)) \ __ret = __cgroup_bpf_run_filter_sock_ops(__sk, \ @@ -327,7 +329,7 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key, #define BPF_CGROUP_RUN_PROG_DEVICE_CGROUP(type, major, minor, access) \ ({ \ int __ret = 0; \ - if (cgroup_bpf_enabled) \ + if (cgroup_bpf_enabled(BPF_CGROUP_DEVICE)) \ __ret = __cgroup_bpf_check_dev_permission(type, major, minor, \ access, \ BPF_CGROUP_DEVICE); \ @@ -339,7 +341,7 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key, #define BPF_CGROUP_RUN_PROG_SYSCTL(head, table, write, buf, count, pos) \ ({ \ int __ret = 0; \ - if (cgroup_bpf_enabled) \ + if (cgroup_bpf_enabled(BPF_CGROUP_SYSCTL)) \ __ret = __cgroup_bpf_run_filter_sysctl(head, table, write, \ buf, count, pos, \ BPF_CGROUP_SYSCTL); \ @@ -350,7 +352,7 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key, kernel_optval) \ ({ \ int __ret = 0; \ - if (cgroup_bpf_enabled) \ + if (cgroup_bpf_enabled(BPF_CGROUP_SETSOCKOPT)) \ __ret = __cgroup_bpf_run_filter_setsockopt(sock, level, \ optname, optval, \ optlen, \ @@ -361,7 +363,7 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key, #define BPF_CGROUP_GETSOCKOPT_MAX_OPTLEN(optlen) \ ({ \ int __ret = 0; \ - if (cgroup_bpf_enabled) \ + if (cgroup_bpf_enabled(BPF_CGROUP_GETSOCKOPT)) \ get_user(__ret, optlen); \ __ret; \ }) @@ -370,7 +372,7 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key, max_optlen, retval) \ ({ \ int __ret = retval; \ - if (cgroup_bpf_enabled) \ + if (cgroup_bpf_enabled(BPF_CGROUP_GETSOCKOPT)) \ __ret = __cgroup_bpf_run_filter_getsockopt(sock, level, \ optname, optval, \ optlen, max_optlen, \ @@ -432,7 +434,7 @@ static inline int bpf_percpu_cgroup_storage_update(struct bpf_map *map, return 0; }
-#define cgroup_bpf_enabled (0) +#define cgroup_bpf_enabled(type) (0) #define BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, type, t_ctx) ({ 0; }) #define BPF_CGROUP_PRE_CONNECT_ENABLED(sk) (0) #define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk,skb) ({ 0; }) diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index ae270d6f97f9..1f6e7df721c3 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -19,7 +19,7 @@
#include "../cgroup/cgroup-internal.h"
-DEFINE_STATIC_KEY_FALSE(cgroup_bpf_enabled_key); +DEFINE_STATIC_KEY_ARRAY_FALSE(cgroup_bpf_enabled_key, MAX_BPF_ATTACH_TYPE); EXPORT_SYMBOL(cgroup_bpf_enabled_key);
void cgroup_bpf_offline(struct cgroup *cgrp) @@ -128,7 +128,7 @@ static void cgroup_bpf_release(struct work_struct *work) if (pl->link) bpf_cgroup_link_auto_detach(pl->link); kfree(pl); - static_branch_dec(&cgroup_bpf_enabled_key); + static_branch_dec(&cgroup_bpf_enabled_key[type]); } old_array = rcu_dereference_protected( cgrp->bpf.effective[type], @@ -499,7 +499,7 @@ int __cgroup_bpf_attach(struct cgroup *cgrp, if (old_prog) bpf_prog_put(old_prog); else - static_branch_inc(&cgroup_bpf_enabled_key); + static_branch_inc(&cgroup_bpf_enabled_key[type]); bpf_cgroup_storages_link(new_storage, cgrp, type); return 0;
@@ -698,7 +698,7 @@ int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, cgrp->bpf.flags[type] = 0; if (old_prog) bpf_prog_put(old_prog); - static_branch_dec(&cgroup_bpf_enabled_key); + static_branch_dec(&cgroup_bpf_enabled_key[type]); return 0;
cleanup: @@ -1360,8 +1360,7 @@ int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level, * attached to the hook so we don't waste time allocating * memory and locking the socket. */ - if (!cgroup_bpf_enabled || - __cgroup_bpf_prog_array_is_empty(cgrp, BPF_CGROUP_SETSOCKOPT)) + if (__cgroup_bpf_prog_array_is_empty(cgrp, BPF_CGROUP_SETSOCKOPT)) return 0;
/* Allocate a bit more than the initial user buffer for @@ -1457,8 +1456,7 @@ int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level, * attached to the hook so we don't waste time allocating * memory and locking the socket. */ - if (!cgroup_bpf_enabled || - __cgroup_bpf_prog_array_is_empty(cgrp, BPF_CGROUP_GETSOCKOPT)) + if (__cgroup_bpf_prog_array_is_empty(cgrp, BPF_CGROUP_GETSOCKOPT)) return retval;
ctx.optlen = max_optlen; diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index fdc0ac586489..f3fb7a380914 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -777,18 +777,19 @@ int inet_getname(struct socket *sock, struct sockaddr *uaddr, return -ENOTCONN; sin->sin_port = inet->inet_dport; sin->sin_addr.s_addr = inet->inet_daddr; + BPF_CGROUP_RUN_SA_PROG_LOCK(sk, (struct sockaddr *)sin, + BPF_CGROUP_INET4_GETPEERNAME, + NULL); } else { __be32 addr = inet->inet_rcv_saddr; if (!addr) addr = inet->inet_saddr; sin->sin_port = inet->inet_sport; sin->sin_addr.s_addr = addr; - } - if (cgroup_bpf_enabled) BPF_CGROUP_RUN_SA_PROG_LOCK(sk, (struct sockaddr *)sin, - peer ? BPF_CGROUP_INET4_GETPEERNAME : - BPF_CGROUP_INET4_GETSOCKNAME, + BPF_CGROUP_INET4_GETSOCKNAME, NULL); + } memset(sin->sin_zero, 0, sizeof(sin->sin_zero)); return sizeof(*sin); } diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 655f0d8a13d3..2884d140764d 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1126,7 +1126,7 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) rcu_read_unlock(); }
- if (cgroup_bpf_enabled && !connected) { + if (cgroup_bpf_enabled(BPF_CGROUP_UDP4_SENDMSG) && !connected) { err = BPF_CGROUP_RUN_PROG_UDP4_SENDMSG_LOCK(sk, (struct sockaddr *)usin, &ipc.addr); if (err) @@ -1860,9 +1860,8 @@ int udp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int noblock, memset(sin->sin_zero, 0, sizeof(sin->sin_zero)); *addr_len = sizeof(*sin);
- if (cgroup_bpf_enabled) - BPF_CGROUP_RUN_PROG_UDP4_RECVMSG_LOCK(sk, - (struct sockaddr *)sin); + BPF_CGROUP_RUN_PROG_UDP4_RECVMSG_LOCK(sk, + (struct sockaddr *)sin); }
if (udp_sk(sk)->gro_enabled) diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index a7e3d170af51..fc985658dc91 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -527,18 +527,19 @@ int inet6_getname(struct socket *sock, struct sockaddr *uaddr, sin->sin6_addr = sk->sk_v6_daddr; if (np->sndflow) sin->sin6_flowinfo = np->flow_label; + BPF_CGROUP_RUN_SA_PROG_LOCK(sk, (struct sockaddr *)sin, + BPF_CGROUP_INET6_GETPEERNAME, + NULL); } else { if (ipv6_addr_any(&sk->sk_v6_rcv_saddr)) sin->sin6_addr = np->saddr; else sin->sin6_addr = sk->sk_v6_rcv_saddr; sin->sin6_port = inet->inet_sport; - } - if (cgroup_bpf_enabled) BPF_CGROUP_RUN_SA_PROG_LOCK(sk, (struct sockaddr *)sin, - peer ? BPF_CGROUP_INET6_GETPEERNAME : - BPF_CGROUP_INET6_GETSOCKNAME, + BPF_CGROUP_INET6_GETSOCKNAME, NULL); + } sin->sin6_scope_id = ipv6_iface_scope_id(&sin->sin6_addr, sk->sk_bound_dev_if); return sizeof(*sin); diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 8a1863146f34..6e9c1a50afce 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -411,9 +411,8 @@ int udpv6_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, } *addr_len = sizeof(*sin6);
- if (cgroup_bpf_enabled) - BPF_CGROUP_RUN_PROG_UDP6_RECVMSG_LOCK(sk, - (struct sockaddr *)sin6); + BPF_CGROUP_RUN_PROG_UDP6_RECVMSG_LOCK(sk, + (struct sockaddr *)sin6); }
if (udp_sk(sk)->gro_enabled) @@ -1461,7 +1460,7 @@ int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) fl6.saddr = np->saddr; fl6.fl6_sport = inet->inet_sport;
- if (cgroup_bpf_enabled && !connected) { + if (cgroup_bpf_enabled(BPF_CGROUP_UDP6_SENDMSG) && !connected) { err = BPF_CGROUP_RUN_PROG_UDP6_SENDMSG_LOCK(sk, (struct sockaddr *)sin6, &fl6.saddr); if (err)
From: Dave Marchevsky <davemarchevsky@fb.com>
mainline inclusion
from mainline-v5.15-rc1
commit 6fc88c354f3af83ffa2c285b86e76c759755693f
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I4GII8?from=project-issue
CVE: NA
----------
Add an enum (cgroup_bpf_attach_type) containing only valid cgroup_bpf attach types and a function to map bpf_attach_type values to the new enum. Inspired by netns_bpf_attach_type.
Then, migrate cgroup_bpf to use cgroup_bpf_attach_type wherever possible. Functionality is unchanged as attach_type_to_prog_type switches in bpf/syscall.c were preventing non-cgroup programs from making use of the invalid cgroup_bpf array slots.
As a result struct cgroup_bpf uses 504 fewer bytes relative to when its arrays were sized using MAX_BPF_ATTACH_TYPE.
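One plausible accounting for that figure (an assumption, not stated in the patch: MAX_BPF_ATTACH_TYPE was 41 when the mainline change landed, versus MAX_CGROUP_BPF_ATTACH_TYPE = 23): on a 64-bit kernel each dropped array slot carries a bpf_prog_array pointer (8 bytes), a list_head (16 bytes), and a u32 of flags (4 bytes), so (41 - 23) * (8 + 16 + 4) = 18 * 28 = 504 bytes.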
bpf_cgroup_storage is notably not migrated as struct bpf_cgroup_storage_key is part of uapi and contains a bpf_attach_type member which is not meant to be opaque. Similarly, bpf_cgroup_link continues to report its bpf_attach_type member to userspace via fdinfo and bpf_link_info.
To ease disambiguation, bpf_attach_type variables are renamed from 'type' to 'atype' when changed to cgroup_bpf_attach_type.
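A minimal sketch (the wrapper and do_cgroup_attach() are hypothetical) of the translation step every attach/detach/query path now performs before indexing the per-cgroup arrays:

  static int cgroup_attach_example(struct cgroup *cgrp, enum bpf_attach_type type)
  {
          enum cgroup_bpf_attach_type atype;

          atype = to_cgroup_bpf_attach_type(type);
          if (atype < 0)
                  return -EINVAL;         /* not a valid cgroup attach type */

          /* from here on the arrays are indexed with atype, never type */
          return do_cgroup_attach(cgrp, atype);
  }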
Signed-off-by: Dave Marchevsky <davemarchevsky@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20210819092420.1984861-2-davemarchevsky@fb.com
Conflicts:
	include/linux/bpf-cgroup.h
	kernel/bpf/cgroup.c
	net/ipv4/af_inet.c
	net/ipv6/af_inet6.c
Signed-off-by: Lu Jialin <lujialin4@huawei.com>
Reviewed-by: Wei Yongjun <weiyongjun1@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 include/linux/bpf-cgroup.h     | 176 ++++++++++++++++++++++-----------
 include/uapi/linux/bpf.h       |   2 +-
 kernel/bpf/cgroup.c            | 154 +++++++++++++++++------------
 net/ipv4/af_inet.c             |   4 +-
 net/ipv4/udp.c                 |   2 +-
 net/ipv6/af_inet6.c            |   4 +-
 net/ipv6/udp.c                 |   2 +-
 tools/include/uapi/linux/bpf.h |   2 +-
 8 files changed, 220 insertions(+), 126 deletions(-)
diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h index 1dcd74f966c5..cd73f5cae523 100644 --- a/include/linux/bpf-cgroup.h +++ b/include/linux/bpf-cgroup.h @@ -24,9 +24,73 @@ struct ctl_table_header; struct task_struct;
#ifdef CONFIG_CGROUP_BPF +enum cgroup_bpf_attach_type { + CGROUP_BPF_ATTACH_TYPE_INVALID = -1, + CGROUP_INET_INGRESS = 0, + CGROUP_INET_EGRESS, + CGROUP_INET_SOCK_CREATE, + CGROUP_SOCK_OPS, + CGROUP_DEVICE, + CGROUP_INET4_BIND, + CGROUP_INET6_BIND, + CGROUP_INET4_CONNECT, + CGROUP_INET6_CONNECT, + CGROUP_INET4_POST_BIND, + CGROUP_INET6_POST_BIND, + CGROUP_UDP4_SENDMSG, + CGROUP_UDP6_SENDMSG, + CGROUP_SYSCTL, + CGROUP_UDP4_RECVMSG, + CGROUP_UDP6_RECVMSG, + CGROUP_GETSOCKOPT, + CGROUP_SETSOCKOPT, + CGROUP_INET4_GETPEERNAME, + CGROUP_INET6_GETPEERNAME, + CGROUP_INET4_GETSOCKNAME, + CGROUP_INET6_GETSOCKNAME, + CGROUP_INET_SOCK_RELEASE, + MAX_CGROUP_BPF_ATTACH_TYPE +}; + +#define CGROUP_ATYPE(type) \ + case BPF_##type: return type + +static inline enum cgroup_bpf_attach_type +to_cgroup_bpf_attach_type(enum bpf_attach_type attach_type) +{ + switch (attach_type) { + CGROUP_ATYPE(CGROUP_INET_INGRESS); + CGROUP_ATYPE(CGROUP_INET_EGRESS); + CGROUP_ATYPE(CGROUP_INET_SOCK_CREATE); + CGROUP_ATYPE(CGROUP_SOCK_OPS); + CGROUP_ATYPE(CGROUP_DEVICE); + CGROUP_ATYPE(CGROUP_INET4_BIND); + CGROUP_ATYPE(CGROUP_INET6_BIND); + CGROUP_ATYPE(CGROUP_INET4_CONNECT); + CGROUP_ATYPE(CGROUP_INET6_CONNECT); + CGROUP_ATYPE(CGROUP_INET4_POST_BIND); + CGROUP_ATYPE(CGROUP_INET6_POST_BIND); + CGROUP_ATYPE(CGROUP_UDP4_SENDMSG); + CGROUP_ATYPE(CGROUP_UDP6_SENDMSG); + CGROUP_ATYPE(CGROUP_SYSCTL); + CGROUP_ATYPE(CGROUP_UDP4_RECVMSG); + CGROUP_ATYPE(CGROUP_UDP6_RECVMSG); + CGROUP_ATYPE(CGROUP_GETSOCKOPT); + CGROUP_ATYPE(CGROUP_SETSOCKOPT); + CGROUP_ATYPE(CGROUP_INET4_GETPEERNAME); + CGROUP_ATYPE(CGROUP_INET6_GETPEERNAME); + CGROUP_ATYPE(CGROUP_INET4_GETSOCKNAME); + CGROUP_ATYPE(CGROUP_INET6_GETSOCKNAME); + CGROUP_ATYPE(CGROUP_INET_SOCK_RELEASE); + default: + return CGROUP_BPF_ATTACH_TYPE_INVALID; + } +} + +#undef CGROUP_ATYPE
-extern struct static_key_false cgroup_bpf_enabled_key[MAX_BPF_ATTACH_TYPE]; -#define cgroup_bpf_enabled(type) static_branch_unlikely(&cgroup_bpf_enabled_key[type]) +extern struct static_key_false cgroup_bpf_enabled_key[MAX_CGROUP_BPF_ATTACH_TYPE]; +#define cgroup_bpf_enabled(atype) static_branch_unlikely(&cgroup_bpf_enabled_key[atype])
#define for_each_cgroup_storage_type(stype) \ for (stype = 0; stype < MAX_BPF_CGROUP_STORAGE_TYPE; stype++) @@ -68,15 +132,15 @@ struct bpf_prog_array;
struct cgroup_bpf { /* array of effective progs in this cgroup */ - struct bpf_prog_array __rcu *effective[MAX_BPF_ATTACH_TYPE]; + struct bpf_prog_array __rcu *effective[MAX_CGROUP_BPF_ATTACH_TYPE];
/* attached progs to this cgroup and attach flags * when flags == 0 or BPF_F_ALLOW_OVERRIDE the progs list will * have either zero or one element * when BPF_F_ALLOW_MULTI the list can have up to BPF_CGROUP_MAX_PROGS */ - struct list_head progs[MAX_BPF_ATTACH_TYPE]; - u32 flags[MAX_BPF_ATTACH_TYPE]; + struct list_head progs[MAX_CGROUP_BPF_ATTACH_TYPE]; + u32 flags[MAX_CGROUP_BPF_ATTACH_TYPE];
/* list of cgroup shared storages */ struct list_head storages; @@ -133,27 +197,27 @@ int cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
int __cgroup_bpf_run_filter_skb(struct sock *sk, struct sk_buff *skb, - enum bpf_attach_type type); + enum cgroup_bpf_attach_type atype);
int __cgroup_bpf_run_filter_sk(struct sock *sk, - enum bpf_attach_type type); + enum cgroup_bpf_attach_type atype);
int __cgroup_bpf_run_filter_sock_addr(struct sock *sk, struct sockaddr *uaddr, - enum bpf_attach_type type, + enum cgroup_bpf_attach_type atype, void *t_ctx);
int __cgroup_bpf_run_filter_sock_ops(struct sock *sk, struct bpf_sock_ops_kern *sock_ops, - enum bpf_attach_type type); + enum cgroup_bpf_attach_type atype);
int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor, - short access, enum bpf_attach_type type); + short access, enum cgroup_bpf_attach_type atype);
int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head, struct ctl_table *table, int write, char **buf, size_t *pcount, loff_t *ppos, - enum bpf_attach_type type); + enum cgroup_bpf_attach_type atype);
int __cgroup_bpf_run_filter_setsockopt(struct sock *sock, int *level, int *optname, char __user *optval, @@ -192,9 +256,9 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key, #define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk, skb) \ ({ \ int __ret = 0; \ - if (cgroup_bpf_enabled(BPF_CGROUP_INET_INGRESS)) \ + if (cgroup_bpf_enabled(CGROUP_INET_INGRESS)) \ __ret = __cgroup_bpf_run_filter_skb(sk, skb, \ - BPF_CGROUP_INET_INGRESS); \ + CGROUP_INET_INGRESS); \ \ __ret; \ }) @@ -202,51 +266,51 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key, #define BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb) \ ({ \ int __ret = 0; \ - if (cgroup_bpf_enabled(BPF_CGROUP_INET_EGRESS) && sk && sk == skb->sk) { \ + if (cgroup_bpf_enabled(CGROUP_INET_EGRESS) && sk && sk == skb->sk) { \ typeof(sk) __sk = sk_to_full_sk(sk); \ if (sk_fullsock(__sk)) \ __ret = __cgroup_bpf_run_filter_skb(__sk, skb, \ - BPF_CGROUP_INET_EGRESS); \ + CGROUP_INET_EGRESS); \ } \ __ret; \ })
-#define BPF_CGROUP_RUN_SK_PROG(sk, type) \ +#define BPF_CGROUP_RUN_SK_PROG(sk, atype) \ ({ \ int __ret = 0; \ - if (cgroup_bpf_enabled(type)) { \ - __ret = __cgroup_bpf_run_filter_sk(sk, type); \ + if (cgroup_bpf_enabled(atype)) { \ + __ret = __cgroup_bpf_run_filter_sk(sk, atype); \ } \ __ret; \ })
#define BPF_CGROUP_RUN_PROG_INET_SOCK(sk) \ - BPF_CGROUP_RUN_SK_PROG(sk, BPF_CGROUP_INET_SOCK_CREATE) + BPF_CGROUP_RUN_SK_PROG(sk, CGROUP_INET_SOCK_CREATE)
#define BPF_CGROUP_RUN_PROG_INET_SOCK_RELEASE(sk) \ - BPF_CGROUP_RUN_SK_PROG(sk, BPF_CGROUP_INET_SOCK_RELEASE) + BPF_CGROUP_RUN_SK_PROG(sk, CGROUP_INET_SOCK_RELEASE)
#define BPF_CGROUP_RUN_PROG_INET4_POST_BIND(sk) \ - BPF_CGROUP_RUN_SK_PROG(sk, BPF_CGROUP_INET4_POST_BIND) + BPF_CGROUP_RUN_SK_PROG(sk, CGROUP_INET4_POST_BIND)
#define BPF_CGROUP_RUN_PROG_INET6_POST_BIND(sk) \ - BPF_CGROUP_RUN_SK_PROG(sk, BPF_CGROUP_INET6_POST_BIND) + BPF_CGROUP_RUN_SK_PROG(sk, CGROUP_INET6_POST_BIND)
-#define BPF_CGROUP_RUN_SA_PROG(sk, uaddr, type) \ +#define BPF_CGROUP_RUN_SA_PROG(sk, uaddr, atype) \ ({ \ int __ret = 0; \ - if (cgroup_bpf_enabled(type)) \ - __ret = __cgroup_bpf_run_filter_sock_addr(sk, uaddr, type, \ + if (cgroup_bpf_enabled(atype)) \ + __ret = __cgroup_bpf_run_filter_sock_addr(sk, uaddr, atype, \ NULL); \ __ret; \ })
-#define BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, type, t_ctx) \ +#define BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, atype, t_ctx) \ ({ \ int __ret = 0; \ - if (cgroup_bpf_enabled(type)) { \ + if (cgroup_bpf_enabled(atype)) { \ lock_sock(sk); \ - __ret = __cgroup_bpf_run_filter_sock_addr(sk, uaddr, type, \ + __ret = __cgroup_bpf_run_filter_sock_addr(sk, uaddr, atype, \ t_ctx); \ release_sock(sk); \ } \ @@ -254,39 +318,39 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key, })
#define BPF_CGROUP_RUN_PROG_INET4_BIND_LOCK(sk, uaddr) \ - BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, BPF_CGROUP_INET4_BIND, NULL) + BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, CGROUP_INET4_BIND, NULL)
#define BPF_CGROUP_RUN_PROG_INET6_BIND_LOCK(sk, uaddr) \ - BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, BPF_CGROUP_INET6_BIND, NULL) + BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, CGROUP_INET6_BIND, NULL)
#define BPF_CGROUP_PRE_CONNECT_ENABLED(sk) \ - ((cgroup_bpf_enabled(BPF_CGROUP_INET4_CONNECT) || \ - cgroup_bpf_enabled(BPF_CGROUP_INET6_CONNECT)) && \ + ((cgroup_bpf_enabled(CGROUP_INET4_CONNECT) || \ + cgroup_bpf_enabled(CGROUP_INET6_CONNECT)) && \ (sk)->sk_prot->pre_connect)
#define BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr) \ - BPF_CGROUP_RUN_SA_PROG(sk, uaddr, BPF_CGROUP_INET4_CONNECT) + BPF_CGROUP_RUN_SA_PROG(sk, uaddr, CGROUP_INET4_CONNECT)
#define BPF_CGROUP_RUN_PROG_INET6_CONNECT(sk, uaddr) \ - BPF_CGROUP_RUN_SA_PROG(sk, uaddr, BPF_CGROUP_INET6_CONNECT) + BPF_CGROUP_RUN_SA_PROG(sk, uaddr, CGROUP_INET6_CONNECT)
#define BPF_CGROUP_RUN_PROG_INET4_CONNECT_LOCK(sk, uaddr) \ - BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, BPF_CGROUP_INET4_CONNECT, NULL) + BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, CGROUP_INET4_CONNECT, NULL)
#define BPF_CGROUP_RUN_PROG_INET6_CONNECT_LOCK(sk, uaddr) \ - BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, BPF_CGROUP_INET6_CONNECT, NULL) + BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, CGROUP_INET6_CONNECT, NULL)
#define BPF_CGROUP_RUN_PROG_UDP4_SENDMSG_LOCK(sk, uaddr, t_ctx) \ - BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, BPF_CGROUP_UDP4_SENDMSG, t_ctx) + BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, CGROUP_UDP4_SENDMSG, t_ctx)
#define BPF_CGROUP_RUN_PROG_UDP6_SENDMSG_LOCK(sk, uaddr, t_ctx) \ - BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, BPF_CGROUP_UDP6_SENDMSG, t_ctx) + BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, CGROUP_UDP6_SENDMSG, t_ctx)
#define BPF_CGROUP_RUN_PROG_UDP4_RECVMSG_LOCK(sk, uaddr) \ - BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, BPF_CGROUP_UDP4_RECVMSG, NULL) + BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, CGROUP_UDP4_RECVMSG, NULL)
#define BPF_CGROUP_RUN_PROG_UDP6_RECVMSG_LOCK(sk, uaddr) \ - BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, BPF_CGROUP_UDP6_RECVMSG, NULL) + BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, CGROUP_UDP6_RECVMSG, NULL)
/* The SOCK_OPS"_SK" macro should be used when sock_ops->sk is not a * fullsock and its parent fullsock cannot be traced by @@ -306,33 +370,33 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key, #define BPF_CGROUP_RUN_PROG_SOCK_OPS_SK(sock_ops, sk) \ ({ \ int __ret = 0; \ - if (cgroup_bpf_enabled(BPF_CGROUP_SOCK_OPS)) \ + if (cgroup_bpf_enabled(CGROUP_SOCK_OPS)) \ __ret = __cgroup_bpf_run_filter_sock_ops(sk, \ sock_ops, \ - BPF_CGROUP_SOCK_OPS); \ + CGROUP_SOCK_OPS); \ __ret; \ })
#define BPF_CGROUP_RUN_PROG_SOCK_OPS(sock_ops) \ ({ \ int __ret = 0; \ - if (cgroup_bpf_enabled(BPF_CGROUP_SOCK_OPS) && (sock_ops)->sk) { \ + if (cgroup_bpf_enabled(CGROUP_SOCK_OPS) && (sock_ops)->sk) { \ typeof(sk) __sk = sk_to_full_sk((sock_ops)->sk); \ if (__sk && sk_fullsock(__sk)) \ __ret = __cgroup_bpf_run_filter_sock_ops(__sk, \ sock_ops, \ - BPF_CGROUP_SOCK_OPS); \ + CGROUP_SOCK_OPS); \ } \ __ret; \ })
-#define BPF_CGROUP_RUN_PROG_DEVICE_CGROUP(type, major, minor, access) \ +#define BPF_CGROUP_RUN_PROG_DEVICE_CGROUP(atype, major, minor, access) \ ({ \ int __ret = 0; \ - if (cgroup_bpf_enabled(BPF_CGROUP_DEVICE)) \ - __ret = __cgroup_bpf_check_dev_permission(type, major, minor, \ + if (cgroup_bpf_enabled(CGROUP_DEVICE)) \ + __ret = __cgroup_bpf_check_dev_permission(atype, major, minor, \ access, \ - BPF_CGROUP_DEVICE); \ + CGROUP_DEVICE); \ \ __ret; \ }) @@ -341,10 +405,10 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key, #define BPF_CGROUP_RUN_PROG_SYSCTL(head, table, write, buf, count, pos) \ ({ \ int __ret = 0; \ - if (cgroup_bpf_enabled(BPF_CGROUP_SYSCTL)) \ + if (cgroup_bpf_enabled(CGROUP_SYSCTL)) \ __ret = __cgroup_bpf_run_filter_sysctl(head, table, write, \ buf, count, pos, \ - BPF_CGROUP_SYSCTL); \ + CGROUP_SYSCTL); \ __ret; \ })
@@ -352,7 +416,7 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key, kernel_optval) \ ({ \ int __ret = 0; \ - if (cgroup_bpf_enabled(BPF_CGROUP_SETSOCKOPT)) \ + if (cgroup_bpf_enabled(CGROUP_SETSOCKOPT)) \ __ret = __cgroup_bpf_run_filter_setsockopt(sock, level, \ optname, optval, \ optlen, \ @@ -363,7 +427,7 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key, #define BPF_CGROUP_GETSOCKOPT_MAX_OPTLEN(optlen) \ ({ \ int __ret = 0; \ - if (cgroup_bpf_enabled(BPF_CGROUP_GETSOCKOPT)) \ + if (cgroup_bpf_enabled(CGROUP_GETSOCKOPT)) \ get_user(__ret, optlen); \ __ret; \ }) @@ -372,7 +436,7 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key, max_optlen, retval) \ ({ \ int __ret = retval; \ - if (cgroup_bpf_enabled(BPF_CGROUP_GETSOCKOPT)) \ + if (cgroup_bpf_enabled(CGROUP_GETSOCKOPT)) \ __ret = __cgroup_bpf_run_filter_getsockopt(sock, level, \ optname, optval, \ optlen, max_optlen, \ @@ -434,8 +498,8 @@ static inline int bpf_percpu_cgroup_storage_update(struct bpf_map *map, return 0; }
-#define cgroup_bpf_enabled(type) (0) -#define BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, type, t_ctx) ({ 0; }) +#define cgroup_bpf_enabled(atype) (0) +#define BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, atype, t_ctx) ({ 0; }) #define BPF_CGROUP_PRE_CONNECT_ENABLED(sk) (0) #define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk,skb) ({ 0; }) #define BPF_CGROUP_RUN_PROG_INET_EGRESS(sk,skb) ({ 0; }) @@ -454,7 +518,7 @@ static inline int bpf_percpu_cgroup_storage_update(struct bpf_map *map, #define BPF_CGROUP_RUN_PROG_UDP4_RECVMSG_LOCK(sk, uaddr) ({ 0; }) #define BPF_CGROUP_RUN_PROG_UDP6_RECVMSG_LOCK(sk, uaddr) ({ 0; }) #define BPF_CGROUP_RUN_PROG_SOCK_OPS(sock_ops) ({ 0; }) -#define BPF_CGROUP_RUN_PROG_DEVICE_CGROUP(type,major,minor,access) ({ 0; }) +#define BPF_CGROUP_RUN_PROG_DEVICE_CGROUP(atype, major, minor, access) ({ 0; }) #define BPF_CGROUP_RUN_PROG_SYSCTL(head,table,write,buf,count,pos) ({ 0; }) #define BPF_CGROUP_GETSOCKOPT_MAX_OPTLEN(optlen) ({ 0; }) #define BPF_CGROUP_RUN_PROG_GETSOCKOPT(sock, level, optname, optval, \ diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 762bf87c26a3..00afbbc130ee 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -78,7 +78,7 @@ struct bpf_lpm_trie_key {
struct bpf_cgroup_storage_key { __u64 cgroup_inode_id; /* cgroup inode id */ - __u32 attach_type; /* program attach type */ + __u32 attach_type; /* program attach type (enum bpf_attach_type) */ };
union bpf_iter_link_info { diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 1f6e7df721c3..8f53be0558c1 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -19,7 +19,7 @@
#include "../cgroup/cgroup-internal.h"
-DEFINE_STATIC_KEY_ARRAY_FALSE(cgroup_bpf_enabled_key, MAX_BPF_ATTACH_TYPE); +DEFINE_STATIC_KEY_ARRAY_FALSE(cgroup_bpf_enabled_key, MAX_CGROUP_BPF_ATTACH_TYPE); EXPORT_SYMBOL(cgroup_bpf_enabled_key);
void cgroup_bpf_offline(struct cgroup *cgrp) @@ -113,12 +113,12 @@ static void cgroup_bpf_release(struct work_struct *work) struct list_head *storages = &cgrp->bpf.storages; struct bpf_cgroup_storage *storage, *stmp;
- unsigned int type; + unsigned int atype;
mutex_lock(&cgroup_mutex);
- for (type = 0; type < ARRAY_SIZE(cgrp->bpf.progs); type++) { - struct list_head *progs = &cgrp->bpf.progs[type]; + for (atype = 0; atype < ARRAY_SIZE(cgrp->bpf.progs); atype++) { + struct list_head *progs = &cgrp->bpf.progs[atype]; struct bpf_prog_list *pl, *pltmp;
list_for_each_entry_safe(pl, pltmp, progs, node) { @@ -128,10 +128,10 @@ static void cgroup_bpf_release(struct work_struct *work) if (pl->link) bpf_cgroup_link_auto_detach(pl->link); kfree(pl); - static_branch_dec(&cgroup_bpf_enabled_key[type]); + static_branch_dec(&cgroup_bpf_enabled_key[atype]); } old_array = rcu_dereference_protected( - cgrp->bpf.effective[type], + cgrp->bpf.effective[atype], lockdep_is_held(&cgroup_mutex)); bpf_prog_array_free(old_array); } @@ -196,7 +196,7 @@ static u32 prog_list_length(struct list_head *head) * if parent has overridable or multi-prog, allow attaching */ static bool hierarchy_allows_attach(struct cgroup *cgrp, - enum bpf_attach_type type) + enum cgroup_bpf_attach_type atype) { struct cgroup *p;
@@ -204,12 +204,12 @@ static bool hierarchy_allows_attach(struct cgroup *cgrp, if (!p) return true; do { - u32 flags = p->bpf.flags[type]; + u32 flags = p->bpf.flags[atype]; u32 cnt;
if (flags & BPF_F_ALLOW_MULTI) return true; - cnt = prog_list_length(&p->bpf.progs[type]); + cnt = prog_list_length(&p->bpf.progs[atype]); WARN_ON_ONCE(cnt > 1); if (cnt == 1) return !!(flags & BPF_F_ALLOW_OVERRIDE); @@ -225,7 +225,7 @@ static bool hierarchy_allows_attach(struct cgroup *cgrp, * to programs in this cgroup */ static int compute_effective_progs(struct cgroup *cgrp, - enum bpf_attach_type type, + enum cgroup_bpf_attach_type atype, struct bpf_prog_array **array) { struct bpf_prog_array_item *item; @@ -236,8 +236,8 @@ static int compute_effective_progs(struct cgroup *cgrp,
/* count number of effective programs by walking parents */ do { - if (cnt == 0 || (p->bpf.flags[type] & BPF_F_ALLOW_MULTI)) - cnt += prog_list_length(&p->bpf.progs[type]); + if (cnt == 0 || (p->bpf.flags[atype] & BPF_F_ALLOW_MULTI)) + cnt += prog_list_length(&p->bpf.progs[atype]); p = cgroup_parent(p); } while (p);
@@ -249,10 +249,10 @@ static int compute_effective_progs(struct cgroup *cgrp, cnt = 0; p = cgrp; do { - if (cnt > 0 && !(p->bpf.flags[type] & BPF_F_ALLOW_MULTI)) + if (cnt > 0 && !(p->bpf.flags[atype] & BPF_F_ALLOW_MULTI)) continue;
- list_for_each_entry(pl, &p->bpf.progs[type], node) { + list_for_each_entry(pl, &p->bpf.progs[atype], node) { if (!prog_list_prog(pl)) continue;
@@ -269,10 +269,10 @@ static int compute_effective_progs(struct cgroup *cgrp, }
static void activate_effective_progs(struct cgroup *cgrp, - enum bpf_attach_type type, + enum cgroup_bpf_attach_type atype, struct bpf_prog_array *old_array) { - old_array = rcu_replace_pointer(cgrp->bpf.effective[type], old_array, + old_array = rcu_replace_pointer(cgrp->bpf.effective[atype], old_array, lockdep_is_held(&cgroup_mutex)); /* free prog array after grace period, since __cgroup_bpf_run_*() * might be still walking the array @@ -328,7 +328,7 @@ int cgroup_bpf_inherit(struct cgroup *cgrp) }
static int update_effective_progs(struct cgroup *cgrp, - enum bpf_attach_type type) + enum cgroup_bpf_attach_type atype) { struct cgroup_subsys_state *css; int err; @@ -340,7 +340,7 @@ static int update_effective_progs(struct cgroup *cgrp, if (percpu_ref_is_zero(&desc->bpf.refcnt)) continue;
- err = compute_effective_progs(desc, type, &desc->bpf.inactive); + err = compute_effective_progs(desc, atype, &desc->bpf.inactive); if (err) goto cleanup; } @@ -357,7 +357,7 @@ static int update_effective_progs(struct cgroup *cgrp, continue; }
- activate_effective_progs(desc, type, desc->bpf.inactive); + activate_effective_progs(desc, atype, desc->bpf.inactive); desc->bpf.inactive = NULL; }
@@ -436,11 +436,12 @@ int __cgroup_bpf_attach(struct cgroup *cgrp, enum bpf_attach_type type, u32 flags) { u32 saved_flags = (flags & (BPF_F_ALLOW_OVERRIDE | BPF_F_ALLOW_MULTI)); - struct list_head *progs = &cgrp->bpf.progs[type]; struct bpf_prog *old_prog = NULL; struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE] = {}; struct bpf_cgroup_storage *new_storage[MAX_BPF_CGROUP_STORAGE_TYPE] = {}; + enum cgroup_bpf_attach_type atype; struct bpf_prog_list *pl; + struct list_head *progs; int err;
if (((flags & BPF_F_ALLOW_OVERRIDE) && (flags & BPF_F_ALLOW_MULTI)) || @@ -454,10 +455,16 @@ int __cgroup_bpf_attach(struct cgroup *cgrp, /* replace_prog implies BPF_F_REPLACE, and vice versa */ return -EINVAL;
- if (!hierarchy_allows_attach(cgrp, type)) + atype = to_cgroup_bpf_attach_type(type); + if (atype < 0) + return -EINVAL; + + progs = &cgrp->bpf.progs[atype]; + + if (!hierarchy_allows_attach(cgrp, atype)) return -EPERM;
- if (!list_empty(progs) && cgrp->bpf.flags[type] != saved_flags) + if (!list_empty(progs) && cgrp->bpf.flags[atype] != saved_flags) /* Disallow attaching non-overridable on top * of existing overridable in this cgroup. * Disallow attaching multi-prog if overridable or none @@ -490,16 +497,16 @@ int __cgroup_bpf_attach(struct cgroup *cgrp, pl->prog = prog; pl->link = link; bpf_cgroup_storages_assign(pl->storage, storage); - cgrp->bpf.flags[type] = saved_flags; + cgrp->bpf.flags[atype] = saved_flags;
- err = update_effective_progs(cgrp, type); + err = update_effective_progs(cgrp, atype); if (err) goto cleanup;
if (old_prog) bpf_prog_put(old_prog); else - static_branch_inc(&cgroup_bpf_enabled_key[type]); + static_branch_inc(&cgroup_bpf_enabled_key[atype]); bpf_cgroup_storages_link(new_storage, cgrp, type); return 0;
@@ -520,7 +527,7 @@ int __cgroup_bpf_attach(struct cgroup *cgrp, * all descendant cgroups. This function is guaranteed to succeed. */ static void replace_effective_prog(struct cgroup *cgrp, - enum bpf_attach_type type, + enum cgroup_bpf_attach_type atype, struct bpf_cgroup_link *link) { struct bpf_prog_array_item *item; @@ -539,10 +546,10 @@ static void replace_effective_prog(struct cgroup *cgrp,
/* find position of link in effective progs array */ for (pos = 0, cg = desc; cg; cg = cgroup_parent(cg)) { - if (pos && !(cg->bpf.flags[type] & BPF_F_ALLOW_MULTI)) + if (pos && !(cg->bpf.flags[atype] & BPF_F_ALLOW_MULTI)) continue;
- head = &cg->bpf.progs[type]; + head = &cg->bpf.progs[atype]; list_for_each_entry(pl, head, node) { if (!prog_list_prog(pl)) continue; @@ -554,7 +561,7 @@ static void replace_effective_prog(struct cgroup *cgrp, found: BUG_ON(!cg); progs = rcu_dereference_protected( - desc->bpf.effective[type], + desc->bpf.effective[atype], lockdep_is_held(&cgroup_mutex)); item = &progs->items[pos]; WRITE_ONCE(item->prog, link->link.prog); @@ -574,11 +581,18 @@ static int __cgroup_bpf_replace(struct cgroup *cgrp, struct bpf_cgroup_link *link, struct bpf_prog *new_prog) { - struct list_head *progs = &cgrp->bpf.progs[link->type]; + enum cgroup_bpf_attach_type atype; struct bpf_prog *old_prog; struct bpf_prog_list *pl; + struct list_head *progs; bool found = false;
+ atype = to_cgroup_bpf_attach_type(link->type); + if (atype < 0) + return -EINVAL; + + progs = &cgrp->bpf.progs[atype]; + if (link->link.prog->type != new_prog->type) return -EINVAL;
@@ -592,7 +606,7 @@ static int __cgroup_bpf_replace(struct cgroup *cgrp, return -ENOENT;
old_prog = xchg(&link->link.prog, new_prog); - replace_effective_prog(cgrp, link->type, link); + replace_effective_prog(cgrp, atype, link); bpf_prog_put(old_prog); return 0; } @@ -667,12 +681,20 @@ static struct bpf_prog_list *find_detach_entry(struct list_head *progs, int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, struct bpf_cgroup_link *link, enum bpf_attach_type type) { - struct list_head *progs = &cgrp->bpf.progs[type]; - u32 flags = cgrp->bpf.flags[type]; - struct bpf_prog_list *pl; + enum cgroup_bpf_attach_type atype; struct bpf_prog *old_prog; + struct bpf_prog_list *pl; + struct list_head *progs; + u32 flags; int err;
+ atype = to_cgroup_bpf_attach_type(type); + if (atype < 0) + return -EINVAL; + + progs = &cgrp->bpf.progs[atype]; + flags = cgrp->bpf.flags[atype]; + if (prog && link) /* only one of prog or link can be specified */ return -EINVAL; @@ -686,7 +708,7 @@ int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, pl->prog = NULL; pl->link = NULL;
- err = update_effective_progs(cgrp, type); + err = update_effective_progs(cgrp, atype); if (err) goto cleanup;
@@ -695,10 +717,10 @@ int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, kfree(pl); if (list_empty(progs)) /* last program was detached, reset flags to zero */ - cgrp->bpf.flags[type] = 0; + cgrp->bpf.flags[atype] = 0; if (old_prog) bpf_prog_put(old_prog); - static_branch_dec(&cgroup_bpf_enabled_key[type]); + static_branch_dec(&cgroup_bpf_enabled_key[atype]); return 0;
cleanup: @@ -714,13 +736,21 @@ int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr, { __u32 __user *prog_ids = u64_to_user_ptr(attr->query.prog_ids); enum bpf_attach_type type = attr->query.attach_type; - struct list_head *progs = &cgrp->bpf.progs[type]; - u32 flags = cgrp->bpf.flags[type]; + enum cgroup_bpf_attach_type atype; struct bpf_prog_array *effective; + struct list_head *progs; struct bpf_prog *prog; int cnt, ret = 0, i; + u32 flags; + + atype = to_cgroup_bpf_attach_type(type); + if (atype < 0) + return -EINVAL; + + progs = &cgrp->bpf.progs[atype]; + flags = cgrp->bpf.flags[atype];
- effective = rcu_dereference_protected(cgrp->bpf.effective[type], + effective = rcu_dereference_protected(cgrp->bpf.effective[atype], lockdep_is_held(&cgroup_mutex));
if (attr->query.query_flags & BPF_F_QUERY_EFFECTIVE) @@ -925,14 +955,14 @@ int cgroup_bpf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) link->cgroup = cgrp; link->type = attr->link_create.attach_type;
- err = bpf_link_prime(&link->link, &link_primer); + err = bpf_link_prime(&link->link, &link_primer); if (err) { kfree(link); goto out_put_cgroup; }
- err = cgroup_bpf_attach(cgrp, NULL, NULL, link, link->type, - BPF_F_ALLOW_MULTI); + err = cgroup_bpf_attach(cgrp, NULL, NULL, link, + link->type, BPF_F_ALLOW_MULTI); if (err) { bpf_link_cleanup(&link_primer); goto out_put_cgroup; @@ -986,7 +1016,7 @@ int cgroup_bpf_prog_query(const union bpf_attr *attr, */ int __cgroup_bpf_run_filter_skb(struct sock *sk, struct sk_buff *skb, - enum bpf_attach_type type) + enum cgroup_bpf_attach_type atype) { unsigned int offset = skb->data - skb_network_header(skb); struct sock *save_sk; @@ -1008,11 +1038,11 @@ int __cgroup_bpf_run_filter_skb(struct sock *sk, /* compute pointers for the bpf prog */ bpf_compute_and_save_data_end(skb, &saved_data_end);
- if (type == BPF_CGROUP_INET_EGRESS) { + if (atype == CGROUP_INET_EGRESS) { ret = BPF_PROG_CGROUP_INET_EGRESS_RUN_ARRAY( - cgrp->bpf.effective[type], skb, __bpf_prog_run_save_cb); + cgrp->bpf.effective[atype], skb, __bpf_prog_run_save_cb); } else { - ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], skb, + ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[atype], skb, __bpf_prog_run_save_cb); ret = (ret == 1 ? 0 : -EPERM); } @@ -1038,12 +1068,12 @@ EXPORT_SYMBOL(__cgroup_bpf_run_filter_skb); * and if it returned != 1 during execution. In all other cases, 0 is returned. */ int __cgroup_bpf_run_filter_sk(struct sock *sk, - enum bpf_attach_type type) + enum cgroup_bpf_attach_type atype) { struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); int ret;
- ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], sk, BPF_PROG_RUN); + ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[atype], sk, BPF_PROG_RUN); return ret == 1 ? 0 : -EPERM; } EXPORT_SYMBOL(__cgroup_bpf_run_filter_sk); @@ -1063,7 +1093,7 @@ EXPORT_SYMBOL(__cgroup_bpf_run_filter_sk); */ int __cgroup_bpf_run_filter_sock_addr(struct sock *sk, struct sockaddr *uaddr, - enum bpf_attach_type type, + enum cgroup_bpf_attach_type atype, void *t_ctx) { struct bpf_sock_addr_kern ctx = { @@ -1087,7 +1117,7 @@ int __cgroup_bpf_run_filter_sock_addr(struct sock *sk, }
cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); - ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], &ctx, BPF_PROG_RUN); + ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[atype], &ctx, BPF_PROG_RUN);
return ret == 1 ? 0 : -EPERM; } @@ -1111,19 +1141,19 @@ EXPORT_SYMBOL(__cgroup_bpf_run_filter_sock_addr); */ int __cgroup_bpf_run_filter_sock_ops(struct sock *sk, struct bpf_sock_ops_kern *sock_ops, - enum bpf_attach_type type) + enum cgroup_bpf_attach_type atype) { struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); int ret;
- ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], sock_ops, + ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[atype], sock_ops, BPF_PROG_RUN); return ret == 1 ? 0 : -EPERM; } EXPORT_SYMBOL(__cgroup_bpf_run_filter_sock_ops);
int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor, - short access, enum bpf_attach_type type) + short access, enum cgroup_bpf_attach_type atype) { struct cgroup *cgrp; struct bpf_cgroup_dev_ctx ctx = { @@ -1135,7 +1165,7 @@ int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor,
rcu_read_lock(); cgrp = task_dfl_cgroup(current); - allow = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], &ctx, + allow = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[atype], &ctx, BPF_PROG_RUN); rcu_read_unlock();
@@ -1227,7 +1257,7 @@ const struct bpf_verifier_ops cg_dev_verifier_ops = { int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head, struct ctl_table *table, int write, char **buf, size_t *pcount, loff_t *ppos, - enum bpf_attach_type type) + enum cgroup_bpf_attach_type atype) { struct bpf_sysctl_kern ctx = { .head = head, @@ -1267,7 +1297,7 @@ int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head,
rcu_read_lock(); cgrp = task_dfl_cgroup(current); - ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], &ctx, BPF_PROG_RUN); + ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[atype], &ctx, BPF_PROG_RUN); rcu_read_unlock();
kfree(ctx.cur_val); @@ -1285,7 +1315,7 @@ int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head,
#ifdef CONFIG_NET static bool __cgroup_bpf_prog_array_is_empty(struct cgroup *cgrp, - enum bpf_attach_type attach_type) + enum cgroup_bpf_attach_type attach_type) { struct bpf_prog_array *prog_array; bool empty; @@ -1360,7 +1390,7 @@ int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level, * attached to the hook so we don't waste time allocating * memory and locking the socket. */ - if (__cgroup_bpf_prog_array_is_empty(cgrp, BPF_CGROUP_SETSOCKOPT)) + if (__cgroup_bpf_prog_array_is_empty(cgrp, CGROUP_SETSOCKOPT)) return 0;
/* Allocate a bit more than the initial user buffer for @@ -1381,7 +1411,7 @@ int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level, }
lock_sock(sk); - ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[BPF_CGROUP_SETSOCKOPT], + ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[CGROUP_SETSOCKOPT], &ctx, BPF_PROG_RUN); release_sock(sk);
@@ -1456,7 +1486,7 @@ int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level, * attached to the hook so we don't waste time allocating * memory and locking the socket. */ - if (__cgroup_bpf_prog_array_is_empty(cgrp, BPF_CGROUP_GETSOCKOPT)) + if (__cgroup_bpf_prog_array_is_empty(cgrp, CGROUP_GETSOCKOPT)) return retval;
ctx.optlen = max_optlen; @@ -1491,7 +1521,7 @@ int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level, }
lock_sock(sk); - ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[BPF_CGROUP_GETSOCKOPT], + ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[CGROUP_GETSOCKOPT], &ctx, BPF_PROG_RUN); release_sock(sk);
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index f3fb7a380914..913adf59785a 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -778,7 +778,7 @@ int inet_getname(struct socket *sock, struct sockaddr *uaddr, sin->sin_port = inet->inet_dport; sin->sin_addr.s_addr = inet->inet_daddr; BPF_CGROUP_RUN_SA_PROG_LOCK(sk, (struct sockaddr *)sin, - BPF_CGROUP_INET4_GETPEERNAME, + CGROUP_INET4_GETPEERNAME, NULL); } else { __be32 addr = inet->inet_rcv_saddr; @@ -787,7 +787,7 @@ int inet_getname(struct socket *sock, struct sockaddr *uaddr, sin->sin_port = inet->inet_sport; sin->sin_addr.s_addr = addr; BPF_CGROUP_RUN_SA_PROG_LOCK(sk, (struct sockaddr *)sin, - BPF_CGROUP_INET4_GETSOCKNAME, + CGROUP_INET4_GETSOCKNAME, NULL); } memset(sin->sin_zero, 0, sizeof(sin->sin_zero)); diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 2884d140764d..c3b414aeb6f9 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1126,7 +1126,7 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) rcu_read_unlock(); }
- if (cgroup_bpf_enabled(BPF_CGROUP_UDP4_SENDMSG) && !connected) { + if (cgroup_bpf_enabled(CGROUP_UDP4_SENDMSG) && !connected) { err = BPF_CGROUP_RUN_PROG_UDP4_SENDMSG_LOCK(sk, (struct sockaddr *)usin, &ipc.addr); if (err) diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index fc985658dc91..b5883538cd92 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -528,7 +528,7 @@ int inet6_getname(struct socket *sock, struct sockaddr *uaddr, if (np->sndflow) sin->sin6_flowinfo = np->flow_label; BPF_CGROUP_RUN_SA_PROG_LOCK(sk, (struct sockaddr *)sin, - BPF_CGROUP_INET6_GETPEERNAME, + CGROUP_INET6_GETPEERNAME, NULL); } else { if (ipv6_addr_any(&sk->sk_v6_rcv_saddr)) @@ -537,7 +537,7 @@ int inet6_getname(struct socket *sock, struct sockaddr *uaddr, sin->sin6_addr = sk->sk_v6_rcv_saddr; sin->sin6_port = inet->inet_sport; BPF_CGROUP_RUN_SA_PROG_LOCK(sk, (struct sockaddr *)sin, - BPF_CGROUP_INET6_GETSOCKNAME, + CGROUP_INET6_GETSOCKNAME, NULL); } sin->sin6_scope_id = ipv6_iface_scope_id(&sin->sin6_addr, diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 6e9c1a50afce..a79c356c59a1 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -1460,7 +1460,7 @@ int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) fl6.saddr = np->saddr; fl6.fl6_sport = inet->inet_sport;
- if (cgroup_bpf_enabled(BPF_CGROUP_UDP6_SENDMSG) && !connected) { + if (cgroup_bpf_enabled(CGROUP_UDP6_SENDMSG) && !connected) { err = BPF_CGROUP_RUN_PROG_UDP6_SENDMSG_LOCK(sk, (struct sockaddr *)sin6, &fl6.saddr); if (err) diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 762bf87c26a3..00afbbc130ee 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -78,7 +78,7 @@ struct bpf_lpm_trie_key {
struct bpf_cgroup_storage_key { __u64 cgroup_inode_id; /* cgroup inode id */ - __u32 attach_type; /* program attach type */ + __u32 attach_type; /* program attach type (enum bpf_attach_type) */ };
union bpf_iter_link_info {
From: Lu Jialin lujialin4@huawei.com
hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4GII8?from=project-issue CVE: NA
--------
We reserve some fields beforehand for cgroup_bpf_attach_type and bpf_cgroup_storage_type, both of which are prone to change. With this enhancement we can hot add/change features of bpf cgroup without breaking the kernel ABI.
After this reservation, cache behavior normally does not matter, as the reserved fields are never accessed.
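For illustration, a later change can take over a reserved slot in place, so the numeric value of every later entry (and of the MAX_ sentinel) stays the same. The names below are hypothetical, for the example only:

enum cgroup_bpf_attach_type_example {
	CGROUP_EX_INET_INGRESS,
	CGROUP_EX_NEW_HOOK,		/* hypothetical: was CGROUP_EX_KABI_RESERVE_1 */
	CGROUP_EX_KABI_RESERVE_2,
	MAX_CGROUP_EX_ATTACH_TYPE	/* still 3: KABI layout preserved */
};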
Signed-off-by: Lu Jialin lujialin4@huawei.com Reviewed-by: weiyang wang wangweiyang2@huawei.com Reviewed-by: Wei Yongjun weiyongjun1@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- include/linux/bpf-cgroup.h | 8 ++++++++ include/linux/bpf.h | 8 ++++++++ 2 files changed, 16 insertions(+)
diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h index cd73f5cae523..bf74485b6567 100644 --- a/include/linux/bpf-cgroup.h +++ b/include/linux/bpf-cgroup.h @@ -49,6 +49,14 @@ enum cgroup_bpf_attach_type { CGROUP_INET4_GETSOCKNAME, CGROUP_INET6_GETSOCKNAME, CGROUP_INET_SOCK_RELEASE, + CGROUP_ATTACH_TYPE_KABI_RESERVE_1, + CGROUP_ATTACH_TYPE_KABI_RESERVE_2, + CGROUP_ATTACH_TYPE_KABI_RESERVE_3, + CGROUP_ATTACH_TYPE_KABI_RESERVE_4, + CGROUP_ATTACH_TYPE_KABI_RESERVE_5, + CGROUP_ATTACH_TYPE_KABI_RESERVE_6, + CGROUP_ATTACH_TYPE_KABI_RESERVE_7, + CGROUP_ATTACH_TYPE_KABI_RESERVE_8, MAX_CGROUP_BPF_ATTACH_TYPE };
diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 88245386a4b6..0cbfdd423553 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -491,6 +491,14 @@ struct bpf_prog_offload { enum bpf_cgroup_storage_type { BPF_CGROUP_STORAGE_SHARED, BPF_CGROUP_STORAGE_PERCPU, + BPF_CGROUP_STORAGE_KABI_RESERVE_1, + BPF_CGROUP_STORAGE_KABI_RESERVE_2, + BPF_CGROUP_STORAGE_KABI_RESERVE_3, + BPF_CGROUP_STORAGE_KABI_RESERVE_4, + BPF_CGROUP_STORAGE_KABI_RESERVE_5, + BPF_CGROUP_STORAGE_KABI_RESERVE_6, + BPF_CGROUP_STORAGE_KABI_RESERVE_7, + BPF_CGROUP_STORAGE_KABI_RESERVE_8, __BPF_CGROUP_STORAGE_MAX };
From: Peng Liang liangpeng10@huawei.com
euleros inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4OELI CVE: NA
-------------------------------------------------
Currently, 24 page flags are used on 32-bit architectures and 27 on 64-bit ones, and 23 gfp flags are in use. Add 2 reserved page flags and 2 reserved gfp flags for internal extension. New flags backported from kernel upstream should be placed behind the reserved flags.
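As a sketch of the intended use (the flag name is made up for this example), a later backport can alias its new flag to a reserved bit instead of growing __GFP_BITS_SHIFT:

/* hypothetical follow-up: claim a reserved bit so __GFP_BITS_SHIFT and
 * all existing flag values stay untouched */
#define ___GFP_SOME_NEW_FLAG	___GFP_RESERVE_0
#define __GFP_SOME_NEW_FLAG	((__force gfp_t)___GFP_SOME_NEW_FLAG)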
Signed-off-by: Peng Liang liangpeng10@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- include/linux/gfp.h | 10 ++++++++-- include/linux/page-flags.h | 8 ++++++++ include/trace/events/mmflags.h | 8 ++++++-- tools/perf/builtin-kmem.c | 2 ++ 4 files changed, 24 insertions(+), 4 deletions(-)
diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 22fc743ddf23..80efbea0c9d7 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -39,8 +39,10 @@ struct vm_area_struct; #define ___GFP_HARDWALL 0x100000u #define ___GFP_THISNODE 0x200000u #define ___GFP_ACCOUNT 0x400000u +#define ___GFP_RESERVE_0 0x800000u +#define ___GFP_RESERVE_1 0x1000000u #ifdef CONFIG_LOCKDEP -#define ___GFP_NOLOCKDEP 0x800000u +#define ___GFP_NOLOCKDEP 0x2000000u #else #define ___GFP_NOLOCKDEP 0 #endif @@ -223,8 +225,12 @@ struct vm_area_struct; /* Disable lockdep for GFP context tracking */ #define __GFP_NOLOCKDEP ((__force gfp_t)___GFP_NOLOCKDEP)
+/* Reserve 2 flags for future usage */ +#define __GFP_RESERVE_0 ((__force gfp_t)___GFP_RESERVE_0) +#define __GFP_RESERVE_1 ((__force gfp_t)___GFP_RESERVE_1) + /* Room for N __GFP_FOO bits */ -#define __GFP_BITS_SHIFT (23 + IS_ENABLED(CONFIG_LOCKDEP)) +#define __GFP_BITS_SHIFT (25 + IS_ENABLED(CONFIG_LOCKDEP)) #define __GFP_BITS_MASK ((__force gfp_t)((1 << __GFP_BITS_SHIFT) - 1))
/** diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 56d9a03119a3..18dbfa2a7c5f 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -139,6 +139,14 @@ enum pageflags { #ifdef CONFIG_64BIT PG_arch_2, #endif + + /* Add reserved page flags for internal extension. For the new page + * flags which backported from kernel upstream, please place them + * behind the reserved page flags. + */ + PG_reserve_pgflag_0, + PG_reserve_pgflag_1, + __NR_PAGEFLAGS,
/* Filesystems */ diff --git a/include/trace/events/mmflags.h b/include/trace/events/mmflags.h index 792e0a2fa775..30700ccb1eea 100644 --- a/include/trace/events/mmflags.h +++ b/include/trace/events/mmflags.h @@ -48,7 +48,9 @@ {(unsigned long)__GFP_WRITE, "__GFP_WRITE"}, \ {(unsigned long)__GFP_RECLAIM, "__GFP_RECLAIM"}, \ {(unsigned long)__GFP_DIRECT_RECLAIM, "__GFP_DIRECT_RECLAIM"},\ - {(unsigned long)__GFP_KSWAPD_RECLAIM, "__GFP_KSWAPD_RECLAIM"}\ + {(unsigned long)__GFP_KSWAPD_RECLAIM, "__GFP_KSWAPD_RECLAIM"},\ + {(unsigned long)__GFP_RESERVE_0, "__GFP_RESERVE_0"}, \ + {(unsigned long)__GFP_RESERVE_1, "__GFP_RESERVE_1"} \
#define show_gfp_flags(flags) \ (flags) ? __print_flags(flags, "|", \ @@ -112,7 +114,9 @@ IF_HAVE_PG_UNCACHED(PG_uncached, "uncached" ) \ IF_HAVE_PG_HWPOISON(PG_hwpoison, "hwpoison" ) \ IF_HAVE_PG_IDLE(PG_young, "young" ) \ IF_HAVE_PG_IDLE(PG_idle, "idle" ) \ -IF_HAVE_PG_ARCH_2(PG_arch_2, "arch_2" ) +IF_HAVE_PG_ARCH_2(PG_arch_2, "arch_2" ), \ + {1UL << PG_reserve_pgflag_0, "reserve_pgflag_0"}, \ + {1UL << PG_reserve_pgflag_1, "reserve_pgflag_1"}
#define show_page_flags(flags) \ (flags) ? __print_flags(flags, "|", \ diff --git a/tools/perf/builtin-kmem.c b/tools/perf/builtin-kmem.c index a50dae2c4ae9..ffe9cf4160cf 100644 --- a/tools/perf/builtin-kmem.c +++ b/tools/perf/builtin-kmem.c @@ -660,6 +660,8 @@ static const struct { { "__GFP_RECLAIM", "R" }, { "__GFP_DIRECT_RECLAIM", "DR" }, { "__GFP_KSWAPD_RECLAIM", "KR" }, + { "__GFP_RESERVE_0", "RE0" }, + { "__GFP_RESERVE_1", "RE1" }, };
static size_t max_gfp_len;
From: Lu Jialin lujialin4@huawei.com
hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4IMAK?from=project-issue CVE: NA
--------
Export memcg.min and memcg.low from cgroupv2 to cgroupv1, in order to reduce the negative impact between cgroups when system memory is insufficient.
Only export the memory.{min/low} knobs in mem_cgroup_legacy_files and move the related functions in front of mem_cgroup_legacy_files. No other changes are needed.
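A minimal user-space sketch of the result; the v1 mount point and group name are assumptions for the example:

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	/* with this patch the v2-style protection files also exist
	 * under the v1 memory controller hierarchy */
	const char *path = "/sys/fs/cgroup/memory/mygrp/memory.min";
	const char *val = "536870912";	/* protect 512 MiB; "max" protects everything */
	int fd = open(path, O_WRONLY);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	if (write(fd, val, strlen(val)) < 0)
		perror("write");
	close(fd);
	return 0;
}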
Signed-off-by: Lu Jialin lujialin4@huawei.com Reviewed-by: weiyang wang wangweiyang2@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- mm/memcontrol.c | 124 ++++++++++++++++++++++++++---------------------- 1 file changed, 68 insertions(+), 56 deletions(-)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 99cfd840e3bd..e0efdbbddd80 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -5013,6 +5013,62 @@ static ssize_t memcg_write_event_control(struct kernfs_open_file *of, return ret; }
+static int seq_puts_memcg_tunable(struct seq_file *m, unsigned long value) +{ + if (value == PAGE_COUNTER_MAX) + seq_puts(m, "max\n"); + else + seq_printf(m, "%llu\n", (u64)value * PAGE_SIZE); + + return 0; +} + +static int memory_min_show(struct seq_file *m, void *v) +{ + return seq_puts_memcg_tunable(m, + READ_ONCE(mem_cgroup_from_seq(m)->memory.min)); +} + +static ssize_t memory_min_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); + unsigned long min; + int err; + + buf = strstrip(buf); + err = page_counter_memparse(buf, "max", &min); + if (err) + return err; + + page_counter_set_min(&memcg->memory, min); + + return nbytes; +} + +static int memory_low_show(struct seq_file *m, void *v) +{ + return seq_puts_memcg_tunable(m, + READ_ONCE(mem_cgroup_from_seq(m)->memory.low)); +} + +static ssize_t memory_low_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); + unsigned long low; + int err; + + buf = strstrip(buf); + err = page_counter_memparse(buf, "max", &low); + if (err) + return err; + + page_counter_set_low(&memcg->memory, low); + + return nbytes; +} + static struct cftype mem_cgroup_legacy_files[] = { { .name = "usage_in_bytes", @@ -5146,6 +5202,18 @@ static struct cftype mem_cgroup_legacy_files[] = { .write = mem_cgroup_reset, .read_u64 = mem_cgroup_read_u64, }, + { + .name = "min", + .flags = CFTYPE_NOT_ON_ROOT, + .seq_show = memory_min_show, + .write = memory_min_write, + }, + { + .name = "low", + .flags = CFTYPE_NOT_ON_ROOT, + .seq_show = memory_low_show, + .write = memory_low_write, + }, { }, /* terminate */ };
@@ -6341,16 +6409,6 @@ static void mem_cgroup_bind(struct cgroup_subsys_state *root_css) root_mem_cgroup->use_hierarchy = false; }
-static int seq_puts_memcg_tunable(struct seq_file *m, unsigned long value) -{ - if (value == PAGE_COUNTER_MAX) - seq_puts(m, "max\n"); - else - seq_printf(m, "%llu\n", (u64)value * PAGE_SIZE); - - return 0; -} - static u64 memory_current_read(struct cgroup_subsys_state *css, struct cftype *cft) { @@ -6359,52 +6417,6 @@ static u64 memory_current_read(struct cgroup_subsys_state *css, return (u64)page_counter_read(&memcg->memory) * PAGE_SIZE; }
-static int memory_min_show(struct seq_file *m, void *v) -{ - return seq_puts_memcg_tunable(m, - READ_ONCE(mem_cgroup_from_seq(m)->memory.min)); -} - -static ssize_t memory_min_write(struct kernfs_open_file *of, - char *buf, size_t nbytes, loff_t off) -{ - struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); - unsigned long min; - int err; - - buf = strstrip(buf); - err = page_counter_memparse(buf, "max", &min); - if (err) - return err; - - page_counter_set_min(&memcg->memory, min); - - return nbytes; -} - -static int memory_low_show(struct seq_file *m, void *v) -{ - return seq_puts_memcg_tunable(m, - READ_ONCE(mem_cgroup_from_seq(m)->memory.low)); -} - -static ssize_t memory_low_write(struct kernfs_open_file *of, - char *buf, size_t nbytes, loff_t off) -{ - struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); - unsigned long low; - int err; - - buf = strstrip(buf); - err = page_counter_memparse(buf, "max", &low); - if (err) - return err; - - page_counter_set_low(&memcg->memory, low); - - return nbytes; -} - static int memory_high_show(struct seq_file *m, void *v) { return seq_puts_memcg_tunable(m,
From: Lu Jialin lujialin4@huawei.com
hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4IMAK?from=project-issue CVE: NA
--------
Export memory.high from cgroupv2 to cgroupv1. Therefore, when the usage of the memcg is larger than memory.high, some pages will be reclaimed before returning to userland, which throttles the process.
Only export the memory.high knob in mem_cgroup_legacy_files and move the related functions in front of mem_cgroup_legacy_files. No other changes are needed.
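The write handler reuses page_counter_memparse(), so the accepted syntax matches cgroup v2; a sketch of that parsing contract (page counts assume a 4 KiB page size):

unsigned long high;
int err;

/* the literal "max" maps to PAGE_COUNTER_MAX, i.e. no limit */
err = page_counter_memparse("max", "max", &high);

/* byte counts are converted to pages, with memparse() suffixes
 * accepted: "1G" -> 262144 pages */
err = page_counter_memparse("1G", "max", &high);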
Signed-off-by: Lu Jialin lujialin4@huawei.com Reviewed-by: weiyang wang wangweiyang2@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- mm/memcontrol.c | 104 +++++++++++++++++++++++++----------------------- 1 file changed, 55 insertions(+), 49 deletions(-)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c index e0efdbbddd80..f93bfe1659a3 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -5069,6 +5069,55 @@ static ssize_t memory_low_write(struct kernfs_open_file *of, return nbytes; }
+static int memory_high_show(struct seq_file *m, void *v) +{ + return seq_puts_memcg_tunable(m, + READ_ONCE(mem_cgroup_from_seq(m)->memory.high)); +} + +static ssize_t memory_high_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); + unsigned int nr_retries = MAX_RECLAIM_RETRIES; + bool drained = false; + unsigned long high; + int err; + + buf = strstrip(buf); + err = page_counter_memparse(buf, "max", &high); + if (err) + return err; + + page_counter_set_high(&memcg->memory, high); + + for (;;) { + unsigned long nr_pages = page_counter_read(&memcg->memory); + unsigned long reclaimed; + + if (nr_pages <= high) + break; + + if (signal_pending(current)) + break; + + if (!drained) { + drain_all_stock(memcg); + drained = true; + continue; + } + + reclaimed = try_to_free_mem_cgroup_pages(memcg, nr_pages - high, + GFP_KERNEL, true); + + if (!reclaimed && !nr_retries--) + break; + } + + memcg_wb_domain_size_changed(memcg); + return nbytes; +} + static struct cftype mem_cgroup_legacy_files[] = { { .name = "usage_in_bytes", @@ -5214,6 +5263,12 @@ static struct cftype mem_cgroup_legacy_files[] = { .seq_show = memory_low_show, .write = memory_low_write, }, + { + .name = "high", + .flags = CFTYPE_NOT_ON_ROOT, + .seq_show = memory_high_show, + .write = memory_high_write, + }, { }, /* terminate */ };
@@ -6417,55 +6472,6 @@ static u64 memory_current_read(struct cgroup_subsys_state *css, return (u64)page_counter_read(&memcg->memory) * PAGE_SIZE; }
-static int memory_high_show(struct seq_file *m, void *v) -{ - return seq_puts_memcg_tunable(m, - READ_ONCE(mem_cgroup_from_seq(m)->memory.high)); -} - -static ssize_t memory_high_write(struct kernfs_open_file *of, - char *buf, size_t nbytes, loff_t off) -{ - struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); - unsigned int nr_retries = MAX_RECLAIM_RETRIES; - bool drained = false; - unsigned long high; - int err; - - buf = strstrip(buf); - err = page_counter_memparse(buf, "max", &high); - if (err) - return err; - - page_counter_set_high(&memcg->memory, high); - - for (;;) { - unsigned long nr_pages = page_counter_read(&memcg->memory); - unsigned long reclaimed; - - if (nr_pages <= high) - break; - - if (signal_pending(current)) - break; - - if (!drained) { - drain_all_stock(memcg); - drained = true; - continue; - } - - reclaimed = try_to_free_mem_cgroup_pages(memcg, nr_pages - high, - GFP_KERNEL, true); - - if (!reclaimed && !nr_retries--) - break; - } - - memcg_wb_domain_size_changed(memcg); - return nbytes; -} - static int memory_max_show(struct seq_file *m, void *v) { return seq_puts_memcg_tunable(m,
From: Lu Jialin lujialin4@huawei.com
hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4IMAK?from=project-issue CVE: NA
--------
Since memory.high reclaim is synchronous and does not run in interrupt context, it can do more work than direct reclaim, e.g. write out dirty pages.
So add the PF_KSWAPD flag, so that current_is_kswapd() returns true for memcg kswapd.
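For reference, current_is_kswapd() is just a test of that flag, so setting PF_KSWAPD around reclaim_high() is enough to steer the worker into the kswapd paths:

static inline int current_is_kswapd(void)
{
	return current->flags & PF_KSWAPD;
}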
Memcg kswapd should stop once the usage of the memcg falls below the memcg kswapd stop threshold. When userland has set memcg->memory.max, the stop_flag is (memcg->memory.high - memcg->memory.max * 10 / 1000), similar to global kswapd. Otherwise, the stop_flag is (memcg->memory.high - memcg->memory.high / 6), which roughly matches the typical gap between watermark_low and watermark_high.
And, memcg kswapd should not break memory.low protection for now.
Signed-off-by: Lu Jialin lujialin4@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Reviewed-by: weiyang wang wangweiyang2@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- mm/memcontrol.c | 4 ++++ mm/vmscan.c | 40 ++++++++++++++++++++++++++++++++++++++-- 2 files changed, 42 insertions(+), 2 deletions(-)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c index f93bfe1659a3..a3cf9c074cfa 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2364,8 +2364,10 @@ static void high_work_func(struct work_struct *work) { struct mem_cgroup *memcg;
+ current->flags |= PF_SWAPWRITE | PF_MEMALLOC | PF_KSWAPD; memcg = container_of(work, struct mem_cgroup, high_work); reclaim_high(memcg, MEMCG_CHARGE_BATCH, GFP_KERNEL); + current->flags &= ~(PF_SWAPWRITE | PF_MEMALLOC | PF_KSWAPD); }
/* @@ -2535,9 +2537,11 @@ void mem_cgroup_handle_over_high(void) * memory.high is currently batched, whereas memory.max and the page * allocator run every time an allocation is made. */ + current->flags |= PF_SWAPWRITE | PF_MEMALLOC | PF_KSWAPD; nr_reclaimed = reclaim_high(memcg, in_retry ? SWAP_CLUSTER_MAX : nr_pages, GFP_KERNEL); + current->flags &= ~(PF_SWAPWRITE | PF_MEMALLOC | PF_KSWAPD);
/* * memory.high is breached and reclaim is unable to keep up. Throttle diff --git a/mm/vmscan.c b/mm/vmscan.c index c851e5f91842..e1e44f0c486d 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -61,6 +61,8 @@
#include "internal.h"
+#define MEMCG_KSWAPD_SCATOR 10 + #define CREATE_TRACE_POINTS #include <trace/events/vmscan.h>
@@ -2834,6 +2836,24 @@ static inline bool should_continue_reclaim(struct pglist_data *pgdat, return inactive_lru_pages > pages_for_compaction; }
+static bool is_memcg_kswapd_stopped(struct scan_control *sc) +{ + struct mem_cgroup *memcg = sc->target_mem_cgroup; + bool is_stop = false; + unsigned long stop_flag = 0; + + if (!cgroup_reclaim(sc)) + return false; + if (memcg->memory.max == PAGE_COUNTER_MAX) + stop_flag = memcg->memory.high / 6; + else + stop_flag = memcg->memory.high - memcg->memory.max * + MEMCG_KSWAPD_SCATOR / 1000; + is_stop = page_counter_read(&memcg->memory) < stop_flag; + + return (current_is_kswapd() && is_stop); +} + static void shrink_node_memcgs(pg_data_t *pgdat, struct scan_control *sc) { struct mem_cgroup *target_memcg = sc->target_mem_cgroup; @@ -2889,6 +2909,14 @@ static void shrink_node_memcgs(pg_data_t *pgdat, struct scan_control *sc) sc->nr_scanned - scanned, sc->nr_reclaimed - reclaimed);
+ /* + * Memcg background reclaim would break iter once memcg kswapd + * flag is satisfied. + */ + if (is_memcg_kswapd_stopped(sc)) { + mem_cgroup_iter_break(target_memcg, memcg); + break; + } } while ((memcg = mem_cgroup_iter(target_memcg, memcg, NULL))); }
@@ -3257,6 +3285,9 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, __count_zid_vm_events(ALLOCSTALL, sc->reclaim_idx, 1);
do { + if (is_memcg_kswapd_stopped(sc)) + break; + vmpressure_prio(sc->gfp_mask, sc->target_mem_cgroup, sc->priority); sc->nr_scanned = 0; @@ -3319,8 +3350,13 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, goto retry; }
- /* Untapped cgroup reserves? Don't OOM, retry. */ - if (sc->memcg_low_skipped) { + /* + * Untapped cgroup reserves? Don't OOM, retry. + * memcg usage is lower than memory.high / 2, memcg kswapd will lead to + * stop memcg reclaim, but should not break low protection. + */ + if (sc->memcg_low_skipped && + !(current_is_kswapd() && cgroup_reclaim(sc))) { sc->priority = initial_priority; sc->force_deactivate = 0; sc->memcg_low_reclaim = 1;
From: Lu Jialin lujialin4@huawei.com
hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4IMAK?from=project-issue CVE: NA
--------
Memcg kswapd can set the dirty state on a memcg when the current scan finds that all scanned pages in the memcg are unqueued dirty. Kswapd then writes out the dirty pages.
Signed-off-by: Lu Jialin lujialin4@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Reviewed-by: weiyang wang wangweiyang2@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- include/linux/mmzone.h | 8 ++++---- mm/vmscan.c | 11 +++++++---- 2 files changed, 11 insertions(+), 8 deletions(-)
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 21b1def88436..96d4a148648d 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -272,6 +272,10 @@ enum lruvec_flags { LRUVEC_CONGESTED, /* lruvec has many dirty pages * backed by a congested BDI */ + LRUVEC_DIRTY, /* reclaim scanning has recently found + * many dirty file pages at the tail of + * the LRU. + */ };
struct lruvec { @@ -595,10 +599,6 @@ struct zone { } ____cacheline_internodealigned_in_smp;
enum pgdat_flags { - PGDAT_DIRTY, /* reclaim scanning has recently found - * many dirty file pages at the tail - * of the LRU. - */ PGDAT_WRITEBACK, /* reclaim scanning has recently found * many pages under writeback */ diff --git a/mm/vmscan.c b/mm/vmscan.c index e1e44f0c486d..7fec6cf7a0ae 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1289,6 +1289,7 @@ static unsigned int shrink_page_list(struct list_head *page_list, LIST_HEAD(free_pages); unsigned int nr_reclaimed = 0; unsigned int pgactivate = 0; + struct lruvec *target_lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, pgdat);
memset(stat, 0, sizeof(*stat)); cond_resched(); @@ -1535,7 +1536,7 @@ static unsigned int shrink_page_list(struct list_head *page_list, */ if (page_is_file_lru(page) && (!current_is_kswapd() || !PageReclaim(page) || - !test_bit(PGDAT_DIRTY, &pgdat->flags))) { + !test_bit(LRUVEC_DIRTY, &target_lruvec->flags))) { /* * Immediately reclaim when written back. * Similar in principal to deactivate_page() @@ -3068,7 +3069,7 @@ static void shrink_node(pg_data_t *pgdat, struct scan_control *sc)
/* Allow kswapd to start writing pages during reclaim.*/ if (sc->nr.unqueued_dirty == sc->nr.file_taken) - set_bit(PGDAT_DIRTY, &pgdat->flags); + set_bit(LRUVEC_DIRTY, &target_lruvec->flags);
/* * If kswapd scans pages marked for immediate @@ -3088,7 +3089,7 @@ static void shrink_node(pg_data_t *pgdat, struct scan_control *sc) * Legacy memcg will stall in page writeback so avoid forcibly * stalling in wait_iff_congested(). */ - if ((current_is_kswapd() || + if (((current_is_kswapd() && !cgroup_reclaim(sc))|| (cgroup_reclaim(sc) && writeback_throttling_sane(sc))) && sc->nr.dirty && sc->nr.dirty == sc->nr.congested) set_bit(LRUVEC_CONGESTED, &target_lruvec->flags); @@ -3322,6 +3323,8 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, zone->zone_pgdat); clear_bit(LRUVEC_CONGESTED, &lruvec->flags); + if (current_is_kswapd()) + clear_bit(LRUVEC_DIRTY, &lruvec->flags); } }
@@ -3712,7 +3715,7 @@ static void clear_pgdat_congested(pg_data_t *pgdat) struct lruvec *lruvec = mem_cgroup_lruvec(NULL, pgdat);
clear_bit(LRUVEC_CONGESTED, &lruvec->flags); - clear_bit(PGDAT_DIRTY, &pgdat->flags); + clear_bit(LRUVEC_DIRTY, &lruvec->flags); clear_bit(PGDAT_WRITEBACK, &pgdat->flags); }
From: Lu Jialin lujialin4@huawei.com
hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4IMAK?from=project-issue CVE: NA
--------
This patch adds a default-false static key that gates the memcg kswapd feature. Users can enable it by adding memcg_kswapd to the kernel cmdline.
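An illustrative boot entry (only the final token matters here; the rest is placeholder):

linux /vmlinuz-5.10 root=/dev/sda2 ro memcg_kswapd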
Signed-off-by: Lu Jialin lujialin4@huawei.com Reviewed-by: weiyang wang wangweiyang2@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- include/linux/memcontrol.h | 3 +++ mm/memcontrol.c | 43 ++++++++++++++++++++++++++++++-------- mm/vmscan.c | 3 ++- 3 files changed, 39 insertions(+), 10 deletions(-)
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 46062d99f14d..e5826f1ff337 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -1863,6 +1863,9 @@ static inline void mem_cgroup_flush_foreign(struct bdi_writeback *wb)
#endif /* CONFIG_CGROUP_WRITEBACK */
+extern struct static_key_false memcg_kswapd_key; +#define mem_cgroup_kswapd_enabled static_branch_unlikely(&memcg_kswapd_key) + struct sock; bool mem_cgroup_charge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages); void mem_cgroup_uncharge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index a3cf9c074cfa..8acf7ff56294 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -96,6 +96,10 @@ bool cgroup_memory_noswap __read_mostly; static DECLARE_WAIT_QUEUE_HEAD(memcg_cgwb_frn_waitq); #endif
+static bool cgroup_memory_kswapd = false; +DEFINE_STATIC_KEY_FALSE(memcg_kswapd_key); +EXPORT_SYMBOL(memcg_kswapd_key); + /* Whether legacy memory+swap accounting is active */ static bool do_memsw_account(void) { @@ -2364,10 +2368,15 @@ static void high_work_func(struct work_struct *work) { struct mem_cgroup *memcg;
- current->flags |= PF_SWAPWRITE | PF_MEMALLOC | PF_KSWAPD; - memcg = container_of(work, struct mem_cgroup, high_work); - reclaim_high(memcg, MEMCG_CHARGE_BATCH, GFP_KERNEL); - current->flags &= ~(PF_SWAPWRITE | PF_MEMALLOC | PF_KSWAPD); + if (mem_cgroup_kswapd_enabled) { + current->flags |= PF_SWAPWRITE | PF_MEMALLOC | PF_KSWAPD; + memcg = container_of(work, struct mem_cgroup, high_work); + reclaim_high(memcg, MEMCG_CHARGE_BATCH, GFP_KERNEL); + current->flags &= ~(PF_SWAPWRITE | PF_MEMALLOC | PF_KSWAPD); + } else { + memcg = container_of(work, struct mem_cgroup, high_work); + reclaim_high(memcg, MEMCG_CHARGE_BATCH, GFP_KERNEL); + } }
/* @@ -2537,11 +2546,17 @@ void mem_cgroup_handle_over_high(void) * memory.high is currently batched, whereas memory.max and the page * allocator run every time an allocation is made. */ - current->flags |= PF_SWAPWRITE | PF_MEMALLOC | PF_KSWAPD; - nr_reclaimed = reclaim_high(memcg, - in_retry ? SWAP_CLUSTER_MAX : nr_pages, - GFP_KERNEL); - current->flags &= ~(PF_SWAPWRITE | PF_MEMALLOC | PF_KSWAPD); + if (mem_cgroup_kswapd_enabled) { + current->flags |= PF_SWAPWRITE | PF_MEMALLOC | PF_KSWAPD; + nr_reclaimed = reclaim_high(memcg, + in_retry ? SWAP_CLUSTER_MAX : nr_pages, + GFP_KERNEL); + current->flags &= ~(PF_SWAPWRITE | PF_MEMALLOC | PF_KSWAPD); + } else { + nr_reclaimed = reclaim_high(memcg, + in_retry ? SWAP_CLUSTER_MAX : nr_pages, + GFP_KERNEL); + }
/* * memory.high is breached and reclaim is unable to keep up. Throttle @@ -5477,6 +5492,9 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) struct mem_cgroup *memcg, *old_memcg; long error = -ENOMEM;
+ if (cgroup_memory_kswapd) + static_branch_enable(&memcg_kswapd_key); + old_memcg = set_active_memcg(parent); memcg = mem_cgroup_alloc(); set_active_memcg(old_memcg); @@ -7280,6 +7298,13 @@ static int __init cgroup_memory(char *s) } __setup("cgroup.memory=", cgroup_memory);
+static int __init memcg_kswapd(char *s) +{ + cgroup_memory_kswapd = true; + return 0; +} +__setup("memcg_kswapd", memcg_kswapd); + /* * subsys_initcall() for memory controller. * diff --git a/mm/vmscan.c b/mm/vmscan.c index 7fec6cf7a0ae..5b5cc00b195b 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2843,8 +2843,9 @@ static bool is_memcg_kswapd_stopped(struct scan_control *sc) bool is_stop = false; unsigned long stop_flag = 0;
- if (!cgroup_reclaim(sc)) + if (!cgroup_reclaim(sc) || !mem_cgroup_kswapd_enabled) return false; + if (memcg->memory.max == PAGE_COUNTER_MAX) stop_flag = memcg->memory.high / 6; else
From: Weilong Chen chenweilong@huawei.com
hulk inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4LGV4 CVE: NA
---------------------------
Patch "cache: Workaround HiSilicon Taishan DC CVAU" breaks the verifiy of cpu capability when hot plug cpus. It set the system scope on but local cpu capability still off. This path fix it by two step: 1. Unset CTR_IDC_SHIFT bit from strict_mask to skip check. 2. Special treatment in read_cpuid_effective_cachetype
Signed-off-by: Weilong Chen chenweilong@huawei.com Reviewed-by: Xie XiuQi xiexiuqi@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- arch/arm64/include/asm/cache.h | 9 +++++++++ arch/arm64/kernel/cpu_errata.c | 1 + 2 files changed, 10 insertions(+)
diff --git a/arch/arm64/include/asm/cache.h b/arch/arm64/include/asm/cache.h index 63d43b5f82f6..cf8e78585865 100644 --- a/arch/arm64/include/asm/cache.h +++ b/arch/arm64/include/asm/cache.h @@ -110,6 +110,15 @@ int cache_line_size(void); static inline u32 __attribute_const__ read_cpuid_effective_cachetype(void) { u32 ctr = read_cpuid_cachetype(); +#ifdef CONFIG_HISILICON_ERRATUM_1980005 + static const struct midr_range idc_support_list[] = { + MIDR_ALL_VERSIONS(MIDR_HISI_TSV110), + MIDR_REV(MIDR_HISI_TSV200, 1, 0), + { /* sentinel */ } + }; + if (is_midr_in_range_list(read_cpuid_id(), idc_support_list)) + ctr |= BIT(CTR_IDC_SHIFT); +#endif
if (!(ctr & BIT(CTR_IDC_SHIFT))) { u64 clidr = read_sysreg(clidr_el1); diff --git a/arch/arm64/kernel/cpu_errata.c b/arch/arm64/kernel/cpu_errata.c index abb6c903abef..f78ce1e6dfa4 100644 --- a/arch/arm64/kernel/cpu_errata.c +++ b/arch/arm64/kernel/cpu_errata.c @@ -79,6 +79,7 @@ hisilicon_1980005_enable(const struct arm64_cpu_capabilities *__unused) { cpus_set_cap(ARM64_HAS_CACHE_IDC); arm64_ftr_reg_ctrel0.sys_val |= BIT(CTR_IDC_SHIFT); + arm64_ftr_reg_ctrel0.strict_mask &= ~BIT(CTR_IDC_SHIFT); sysreg_clear_set(sctlr_el1, SCTLR_EL1_UCT, 0); } #endif
From: Yanling Song songyl@ramaxel.com
Ramaxel inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4OENF CVE: NA
Firmware is responsible for the allocation and release of the RSS module ID, and the driver no longer perceives it.
Signed-off-by: Yanling Song songyl@ramaxel.com Reviewed-by: Yang Gan yanggan@ramaxel.com Acked-by: Xie XiuQi xiexiuqi@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- .../net/ethernet/ramaxel/spnic/spnic_main.c | 6 -- .../ramaxel/spnic/spnic_mgmt_interface.h | 12 ---- .../ethernet/ramaxel/spnic/spnic_nic_cfg.c | 16 +++--- .../ethernet/ramaxel/spnic/spnic_nic_cfg.h | 16 ------ .../net/ethernet/ramaxel/spnic/spnic_rss.c | 16 +----- .../ethernet/ramaxel/spnic/spnic_rss_cfg.c | 57 ------------------- 6 files changed, 8 insertions(+), 115 deletions(-)
diff --git a/drivers/net/ethernet/ramaxel/spnic/spnic_main.c b/drivers/net/ethernet/ramaxel/spnic/spnic_main.c index f09f11488042..3670d92eaf2a 100644 --- a/drivers/net/ethernet/ramaxel/spnic/spnic_main.c +++ b/drivers/net/ethernet/ramaxel/spnic/spnic_main.c @@ -350,9 +350,6 @@ static void spnic_sw_deinit(struct spnic_nic_dev *nic_dev) spnic_del_mac(nic_dev->hwdev, nic_dev->netdev->dev_addr, 0, sphw_global_func_id(nic_dev->hwdev), SPHW_CHANNEL_NIC);
- if (test_bit(SPNIC_RSS_ENABLE, &nic_dev->flags)) - spnic_rss_template_free(nic_dev->hwdev); - spnic_clear_rss_config(nic_dev); }
@@ -423,9 +420,6 @@ static int spnic_sw_init(struct spnic_nic_dev *nic_dev) set_mac_err: err_mac: get_mac_err: - - if (test_bit(SPNIC_RSS_ENABLE, &nic_dev->flags)) - spnic_rss_template_free(nic_dev->hwdev); spnic_clear_rss_config(nic_dev);
return err; diff --git a/drivers/net/ethernet/ramaxel/spnic/spnic_mgmt_interface.h b/drivers/net/ethernet/ramaxel/spnic/spnic_mgmt_interface.h index 58ab35c18057..720f6fcf5f0a 100644 --- a/drivers/net/ethernet/ramaxel/spnic/spnic_mgmt_interface.h +++ b/drivers/net/ethernet/ramaxel/spnic/spnic_mgmt_interface.h @@ -350,18 +350,6 @@ struct spnic_rss_indir_table { u8 indir[SPNIC_RSS_INDIR_SIZE]; };
-#define NIC_RSS_CMD_TEMP_ALLOC 0x01 -#define NIC_RSS_CMD_TEMP_FREE 0x02 - -struct spnic_rss_template_mgmt { - struct mgmt_msg_head msg_head; - - u16 func_id; - u8 cmd; - u8 template_id; - u8 rsvd1[4]; -}; - #define SPNIC_DCB_UP_MAX 0x8 #define SPNIC_DCB_COS_MAX 0x8 #define SPNIC_DCB_TC_MAX 0x8 diff --git a/drivers/net/ethernet/ramaxel/spnic/spnic_nic_cfg.c b/drivers/net/ethernet/ramaxel/spnic/spnic_nic_cfg.c index 5bde91f35cca..24e55c087da5 100644 --- a/drivers/net/ethernet/ramaxel/spnic/spnic_nic_cfg.c +++ b/drivers/net/ethernet/ramaxel/spnic/spnic_nic_cfg.c @@ -844,6 +844,12 @@ int spnic_init_nic_hwdev(void *hwdev, void *pcidev_hdl, void *dev_hdl, u16 rx_bu goto register_sa_err; }
+ err = spnic_init_function_table(nic_cfg); + if (err) { + nic_err(nic_cfg->dev_hdl, "Failed to init function table\n"); + goto init_func_tbl_err; + } + err = spnic_get_nic_feature(hwdev, &nic_cfg->feature_cap, 1); if (err) { nic_err(nic_cfg->dev_hdl, "Failed to get nic features\n"); @@ -867,22 +873,14 @@ int spnic_init_nic_hwdev(void *hwdev, void *pcidev_hdl, void *dev_hdl, u16 rx_bu
nic_cfg->rx_buff_len = rx_buff_len;
- err = spnic_init_function_table(nic_cfg); - if (err) { - nic_err(nic_cfg->dev_hdl, "Failed to init function table\n"); - goto init_func_tbl_err; - } - return 0;
-init_func_tbl_err: - spnic_vf_func_free(nic_cfg); - vf_init_err: sphw_aeq_unregister_swe_cb(hwdev, SPHW_STATELESS_EVENT);
register_sw_aeqe_err: get_feature_err: +init_func_tbl_err: sphw_unregister_service_adapter(hwdev, SERVICE_T_NIC);
register_sa_err: diff --git a/drivers/net/ethernet/ramaxel/spnic/spnic_nic_cfg.h b/drivers/net/ethernet/ramaxel/spnic/spnic_nic_cfg.h index 0872a2f94655..8be8f5334f72 100644 --- a/drivers/net/ethernet/ramaxel/spnic/spnic_nic_cfg.h +++ b/drivers/net/ethernet/ramaxel/spnic/spnic_nic_cfg.h @@ -403,22 +403,6 @@ int spnic_set_vf_link_state(void *hwdev, u16 vf_id, int link); */ int spnic_get_port_info(void *hwdev, struct nic_port_info *port_info, u16 channel);
-/* * - * @brief spnic_rss_template_alloc - alloc rss template table - * @param hwdev: device pointer to hwdev - * @retval zero: success - * @retval non-zero: failure - */ -int spnic_rss_template_alloc(void *hwdev); - -/* * - * @brief spnic_rss_template_free - free rss template table - * @param hwdev: device pointer to hwdev - * @retval zero: success - * @retval non-zero: failure - */ -int spnic_rss_template_free(void *hwdev); - /* * * @brief spnic_set_rss_type - set rss type * @param hwdev: device pointer to hwdev diff --git a/drivers/net/ethernet/ramaxel/spnic/spnic_rss.c b/drivers/net/ethernet/ramaxel/spnic/spnic_rss.c index 86f6f92f669b..956d868df5b5 100644 --- a/drivers/net/ethernet/ramaxel/spnic/spnic_rss.c +++ b/drivers/net/ethernet/ramaxel/spnic/spnic_rss.c @@ -224,7 +224,7 @@ void spnic_try_to_enable_rss(struct spnic_nic_dev *nic_dev) return;
nic_dev->max_qps = sphw_func_max_nic_qnum(nic_dev->hwdev); - if (nic_dev->max_qps <= 1) + if (nic_dev->max_qps <= 1 || !SPNIC_SUPPORT_RSS(nic_dev->hwdev)) goto set_q_params;
err = alloc_rss_resource(nic_dev); @@ -233,17 +233,6 @@ void spnic_try_to_enable_rss(struct spnic_nic_dev *nic_dev) goto set_q_params; }
- err = spnic_rss_template_alloc(nic_dev->hwdev); - if (err) { - if (err == -ENOSPC) - nic_err(&nic_dev->pdev->dev, "Failed to alloc template for rss, table is full\n"); - else - nic_err(&nic_dev->pdev->dev, "Failed to alloc template for rss, can't enable rss for this function\n"); - spnic_clear_rss_config(nic_dev); - nic_dev->max_qps = 1; - goto set_q_params; - } - set_bit(SPNIC_RSS_ENABLE, &nic_dev->flags); nic_dev->max_qps = sphw_func_max_nic_qnum(nic_dev->hwdev);
@@ -258,9 +247,6 @@ void spnic_try_to_enable_rss(struct spnic_nic_dev *nic_dev) nic_err(&nic_dev->pdev->dev, "Failed to set hardware rss parameters\n");
spnic_clear_rss_config(nic_dev); - err = spnic_rss_template_free(nic_dev->hwdev); - if (err) - return; nic_dev->max_qps = 1; goto set_q_params; } diff --git a/drivers/net/ethernet/ramaxel/spnic/spnic_rss_cfg.c b/drivers/net/ethernet/ramaxel/spnic/spnic_rss_cfg.c index b5be80cc304f..12da3aa59400 100644 --- a/drivers/net/ethernet/ramaxel/spnic/spnic_rss_cfg.c +++ b/drivers/net/ethernet/ramaxel/spnic/spnic_rss_cfg.c @@ -18,63 +18,6 @@ #include "spnic_nic.h" #include "sphw_common.h"
-int spnic_rss_template_alloc(void *hwdev) -{ - struct spnic_rss_template_mgmt template_mgmt; - struct spnic_nic_cfg *nic_cfg = NULL; - u16 out_size = sizeof(template_mgmt); - int err; - - if (!hwdev) - return -EINVAL; - - nic_cfg = sphw_get_service_adapter(hwdev, SERVICE_T_NIC); - - memset(&template_mgmt, 0, sizeof(struct spnic_rss_template_mgmt)); - - template_mgmt.func_id = sphw_global_func_id(hwdev); - template_mgmt.cmd = NIC_RSS_CMD_TEMP_ALLOC; - - err = l2nic_msg_to_mgmt_sync(hwdev, SPNIC_NIC_CMD_RSS_TEMP_MGR, - &template_mgmt, sizeof(template_mgmt), - &template_mgmt, &out_size); - if (err || !out_size || template_mgmt.msg_head.status) { - nic_err(nic_cfg->dev_hdl, "Failed to alloc rss template, err: %d, status: 0x%x, out size: 0x%x\n", - err, template_mgmt.msg_head.status, out_size); - return -EINVAL; - } - - return 0; -} - -int spnic_rss_template_free(void *hwdev) -{ - struct spnic_rss_template_mgmt template_mgmt; - struct spnic_nic_cfg *nic_cfg = NULL; - u16 out_size = sizeof(template_mgmt); - int err; - - if (!hwdev) - return -EINVAL; - - nic_cfg = sphw_get_service_adapter(hwdev, SERVICE_T_NIC); - memset(&template_mgmt, 0, sizeof(struct spnic_rss_template_mgmt)); - - template_mgmt.func_id = sphw_global_func_id(hwdev); - template_mgmt.cmd = NIC_RSS_CMD_TEMP_FREE; - - err = l2nic_msg_to_mgmt_sync(hwdev, SPNIC_NIC_CMD_RSS_TEMP_MGR, - &template_mgmt, sizeof(template_mgmt), - &template_mgmt, &out_size); - if (err || !out_size || template_mgmt.msg_head.status) { - nic_err(nic_cfg->dev_hdl, "Failed to free rss template, err: %d, status: 0x%x, out size: 0x%x\n", - err, template_mgmt.msg_head.status, out_size); - return -EINVAL; - } - - return 0; -} - static int spnic_rss_cfg_hash_key(struct spnic_nic_cfg *nic_cfg, u8 opcode, u8 *key) { struct spnic_cmd_rss_hash_key hash_key;
From: Yanling Song songyl@ramaxel.com
Ramaxel inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4OENF CVE: NA
Optimize the negotiation of properties between driver and firmware: after obtaining the attributes from the firmware, intersect them with the attributes supported by the driver.
Signed-off-by: Yanling Song songyl@ramaxel.com Reviewed-by: Yang Gan yanggan@ramaxel.com Acked-by: Xie XiuQi xiexiuqi@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- .../ethernet/ramaxel/spnic/hw/sphw_hwdev.c | 9 +++- .../ethernet/ramaxel/spnic/hw/sphw_hwdev.h | 2 + .../net/ethernet/ramaxel/spnic/spnic_main.c | 16 ++++---- .../ethernet/ramaxel/spnic/spnic_nic_cfg.c | 41 +++++++++++++------ .../ethernet/ramaxel/spnic/spnic_nic_cfg.h | 3 +- .../ethernet/ramaxel/spnic/spnic_nic_dev.h | 2 + 6 files changed, 50 insertions(+), 23 deletions(-)
diff --git a/drivers/net/ethernet/ramaxel/spnic/hw/sphw_hwdev.c b/drivers/net/ethernet/ramaxel/spnic/hw/sphw_hwdev.c index c88799bcda98..fc7d101a7fe3 100644 --- a/drivers/net/ethernet/ramaxel/spnic/hw/sphw_hwdev.c +++ b/drivers/net/ethernet/ramaxel/spnic/hw/sphw_hwdev.c @@ -699,7 +699,8 @@ static void sphw_unsync_mgmt_func_state(struct sphw_hwdev *hwdev)
static int init_basic_attributes(struct sphw_hwdev *hwdev) { - int err; + u64 drv_feature[COMM_MAX_FEATURE_QWORD] = {SPHW_DRV_FEATURE_QW0}; + int err, i;
err = sphw_get_board_info(hwdev, &hwdev->board_info, SPHW_CHANNEL_COMM); if (err) @@ -711,7 +712,11 @@ static int init_basic_attributes(struct sphw_hwdev *hwdev) return err; }
- sdk_info(hwdev->dev_hdl, "Comm features: 0x%llx\n", hwdev->features[0]); + sdk_info(hwdev->dev_hdl, "Comm hw features: 0x%llx, drv features: 0x%llx\n", + hwdev->features[0], drv_feature[0]); + + for (i = 0; i < COMM_MAX_FEATURE_QWORD; i++) + hwdev->features[i] &= drv_feature[i];
err = sphw_get_global_attr(hwdev, &hwdev->glb_attr); if (err) { diff --git a/drivers/net/ethernet/ramaxel/spnic/hw/sphw_hwdev.h b/drivers/net/ethernet/ramaxel/spnic/hw/sphw_hwdev.h index 451df0306e6c..bf0b93d9a4d2 100644 --- a/drivers/net/ethernet/ramaxel/spnic/hw/sphw_hwdev.h +++ b/drivers/net/ethernet/ramaxel/spnic/hw/sphw_hwdev.h @@ -90,4 +90,6 @@ struct sphw_hwdev { #define COMM_FEATURE_QW0(hwdev, feature) ((hwdev)->features[0] & COMM_F_##feature) #define COMM_SUPPORT_API_CHAIN(hwdev) COMM_FEATURE_QW0(hwdev, API_CHAIN)
+#define SPHW_DRV_FEATURE_QW0 COMM_F_API_CHAIN + #endif diff --git a/drivers/net/ethernet/ramaxel/spnic/spnic_main.c b/drivers/net/ethernet/ramaxel/spnic/spnic_main.c index 3670d92eaf2a..5b9822ebd806 100644 --- a/drivers/net/ethernet/ramaxel/spnic/spnic_main.c +++ b/drivers/net/ethernet/ramaxel/spnic/spnic_main.c @@ -356,9 +356,13 @@ static void spnic_sw_deinit(struct spnic_nic_dev *nic_dev) static int spnic_sw_init(struct spnic_nic_dev *nic_dev) { struct net_device *netdev = nic_dev->netdev; - + u64 nic_feature; int err = 0;
+ nic_feature = spnic_get_feature_cap(nic_dev->hwdev); + nic_feature &= SPNIC_DRV_FEATURE; + spnic_update_nic_feature(nic_dev->hwdev, nic_feature); + sema_init(&nic_dev->port_state_sem, 1);
err = spnic_dcb_init(nic_dev); @@ -652,7 +656,6 @@ static int setup_nic_dev(struct net_device *netdev, struct spnic_lld_dev *lld_de
static int spnic_set_default_hw_feature(struct spnic_nic_dev *nic_dev) { - u64 nic_features; int err;
if (!SPNIC_FUNC_IS_VF(nic_dev->hwdev)) { @@ -663,8 +666,7 @@ static int spnic_set_default_hw_feature(struct spnic_nic_dev *nic_dev) } }
- nic_features = spnic_get_feature_cap(nic_dev->hwdev); - err = spnic_set_nic_feature(nic_dev->hwdev, &nic_features, 1); + err = spnic_set_nic_feature_to_hw(nic_dev->hwdev); if (err) { nic_err(&nic_dev->pdev->dev, "Failed to set nic features\n"); return err; @@ -729,13 +731,13 @@ static int nic_probe(struct spnic_lld_dev *lld_dev, void **uld_dev, char *uld_de goto init_nic_hwdev_err; }
- spnic_assign_netdev_ops(nic_dev); - netdev_feature_init(netdev); - err = spnic_sw_init(nic_dev); if (err) goto sw_init_err;
+ spnic_assign_netdev_ops(nic_dev); + netdev_feature_init(netdev); + err = spnic_set_default_hw_feature(nic_dev); if (err) goto set_features_err; diff --git a/drivers/net/ethernet/ramaxel/spnic/spnic_nic_cfg.c b/drivers/net/ethernet/ramaxel/spnic/spnic_nic_cfg.c index 24e55c087da5..d241f6a7947d 100644 --- a/drivers/net/ethernet/ramaxel/spnic/spnic_nic_cfg.c +++ b/drivers/net/ethernet/ramaxel/spnic/spnic_nic_cfg.c @@ -795,14 +795,38 @@ static int nic_feature_nego(void *hwdev, u8 opcode, u64 *s_feature, u16 size) return 0; }
-int spnic_get_nic_feature(void *hwdev, u64 *s_feature, u16 size) +static int spnic_get_nic_feature_from_hw(void *hwdev, u64 *s_feature, u16 size) { return nic_feature_nego(hwdev, SPNIC_CMD_OP_GET, s_feature, size); }
-int spnic_set_nic_feature(void *hwdev, u64 *s_feature, u16 size) +int spnic_set_nic_feature_to_hw(void *hwdev) { - return nic_feature_nego(hwdev, SPNIC_CMD_OP_SET, s_feature, size); + struct spnic_nic_cfg *nic_cfg = NULL; + + nic_cfg = sphw_get_service_adapter(hwdev, SERVICE_T_NIC); + + return nic_feature_nego(hwdev, SPNIC_CMD_OP_SET, &nic_cfg->feature_cap, 1); +} + +u64 spnic_get_feature_cap(void *hwdev) +{ + struct spnic_nic_cfg *nic_cfg = NULL; + + nic_cfg = sphw_get_service_adapter(hwdev, SERVICE_T_NIC); + + return nic_cfg->feature_cap; +} + +void spnic_update_nic_feature(void *hwdev, u64 feature) +{ + struct spnic_nic_cfg *nic_cfg = NULL; + + nic_cfg = sphw_get_service_adapter(hwdev, SERVICE_T_NIC); + + nic_cfg->feature_cap = feature; + + nic_info(nic_cfg->dev_hdl, "Update nic feature to 0x%llx\n", nic_cfg->feature_cap); }
static inline int init_nic_hwdev_param_valid(void *hwdev, void *pcidev_hdl, void *dev_hdl) @@ -850,7 +874,7 @@ int spnic_init_nic_hwdev(void *hwdev, void *pcidev_hdl, void *dev_hdl, u16 rx_bu goto init_func_tbl_err; }
- err = spnic_get_nic_feature(hwdev, &nic_cfg->feature_cap, 1); + err = spnic_get_nic_feature_from_hw(hwdev, &nic_cfg->feature_cap, 1); if (err) { nic_err(nic_cfg->dev_hdl, "Failed to get nic features\n"); goto get_feature_err; @@ -1128,15 +1152,6 @@ int spnic_set_vlan_fliter(void *hwdev, u32 vlan_filter_ctrl) return 0; }
-u64 spnic_get_feature_cap(void *hwdev) -{ - struct spnic_nic_cfg *nic_cfg = NULL; - - nic_cfg = sphw_get_service_adapter(hwdev, SERVICE_T_NIC); - - return nic_cfg->feature_cap; -} - int spnic_add_tcam_rule(void *hwdev, struct nic_tcam_cfg_rule *tcam_rule) { u16 out_size = sizeof(struct nic_cmd_fdir_add_rule); diff --git a/drivers/net/ethernet/ramaxel/spnic/spnic_nic_cfg.h b/drivers/net/ethernet/ramaxel/spnic/spnic_nic_cfg.h index 8be8f5334f72..f280b41fe362 100644 --- a/drivers/net/ethernet/ramaxel/spnic/spnic_nic_cfg.h +++ b/drivers/net/ethernet/ramaxel/spnic/spnic_nic_cfg.h @@ -703,6 +703,7 @@ int spnic_set_autoneg(void *hwdev, bool enable); int spnic_get_sfp_type(void *hwdev, u8 *sfp_type, u8 *sfp_type_ext); int spnic_get_sfp_eeprom(void *hwdev, u8 *data, u32 len);
-int spnic_set_nic_feature(void *hwdev, u64 *s_feature, u16 size); +int spnic_set_nic_feature_to_hw(void *hwdev); +void spnic_update_nic_feature(void *hwdev, u64 feature);
#endif diff --git a/drivers/net/ethernet/ramaxel/spnic/spnic_nic_dev.h b/drivers/net/ethernet/ramaxel/spnic/spnic_nic_dev.h index 37068f1b48f2..8a0708fda19a 100644 --- a/drivers/net/ethernet/ramaxel/spnic/spnic_nic_dev.h +++ b/drivers/net/ethernet/ramaxel/spnic/spnic_nic_dev.h @@ -54,6 +54,8 @@ enum spnic_flags { #define VID_LINE(nic_dev, vid) ((vid) / VLAN_BITMAP_BITS_SIZE(nic_dev)) #define VID_COL(nic_dev, vid) ((vid) & (VLAN_BITMAP_BITS_SIZE(nic_dev) - 1))
+#define SPNIC_DRV_FEATURE NIC_F_ALL_MASK + enum spnic_event_work_flags { EVENT_WORK_TX_TIMEOUT, };
From: Yanling Song songyl@ramaxel.com
Ramaxel inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4OENF CVE: NA
Rearrange the order and add some flag bits for future use.
Signed-off-by: Yanling Song songyl@ramaxel.com Reviewed-by: Yang Gan yanggan@ramaxel.com Acked-by: Xie XiuQi xiexiuqi@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- .../net/ethernet/ramaxel/spnic/hw/sphw_hw.h | 28 ++++++++----------- 1 file changed, 12 insertions(+), 16 deletions(-)
diff --git a/drivers/net/ethernet/ramaxel/spnic/hw/sphw_hw.h b/drivers/net/ethernet/ramaxel/spnic/hw/sphw_hw.h index 2fc99c5f6096..74607ff24f09 100644 --- a/drivers/net/ethernet/ramaxel/spnic/hw/sphw_hw.h +++ b/drivers/net/ethernet/ramaxel/spnic/hw/sphw_hw.h @@ -605,28 +605,24 @@ void sphw_event_callback(void *hwdev, struct sphw_event_info *event); void sphw_link_event_stats(void *dev, u8 link);
enum func_reset_flag { - RES_TYPE_COMM = 0, - RES_TYPE_NIC = 1, - RES_TYPE_OVS = 2, - RES_TYPE_VBS = 3, - RES_TYPE_ROCE = 4, - RES_TYPE_FC = 5, - RES_TYPE_TOE = 6, - - RES_TYPE_FLUSH_BIT = 7, + RES_TYPE_FLUSH_BIT = 0, RES_TYPE_MQM, RES_TYPE_SMF, - RES_TYPE_CMDQ_ROOTCTX, - RES_TYPE_SQ_CI_TABLE, - RES_TYPE_PF_BW_CFG, - RES_TYPE_CEQ, - RES_TYPE_MBOX, - RES_TYPE_AEQ, + + RES_TYPE_COMM = 10, + RES_TYPE_COMM_MGMT_CH, + RES_TYPE_COMM_CMD_CH, + RES_TYPE_NIC, + RES_TYPE_OVS, + RES_TYPE_VBS, + RES_TYPE_ROCE, + RES_TYPE_FC, + RES_TYPE_TOE, RES_TYPE_IPSEC, };
#define SPHW_COMM_RES (BIT(RES_TYPE_COMM) | BIT(RES_TYPE_FLUSH_BIT) | BIT(RES_TYPE_MQM) | \ - BIT(RES_TYPE_SMF) | BIT(RES_TYPE_CMDQ_ROOTCTX)) + BIT(RES_TYPE_SMF) | BIT(RES_TYPE_COMM_CMD_CH))
#define SPHW_NIC_RES BIT(RES_TYPE_NIC) #define SPHW_FC_RES BIT(RES_TYPE_FC)
From: Wang Wensheng wangwensheng4@huawei.com
ascend inclusion category: Feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4NDAW CVE: NA
-------------------
Add these configs in openeuler_defconfig: CONFIG_ASCEND_CHARGE_MIGRATE_HUGEPAGES=y CONFIG_ASCEND_SHARE_POOL=y
Signed-off-by: Wang Wensheng wangwensheng4@huawei.com Reviewed-by: Weilong Chen chenweilong@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- arch/arm64/configs/openeuler_defconfig | 2 ++ 1 file changed, 2 insertions(+)
diff --git a/arch/arm64/configs/openeuler_defconfig b/arch/arm64/configs/openeuler_defconfig index 0a0508e8a7f1..dfbfa10a6097 100644 --- a/arch/arm64/configs/openeuler_defconfig +++ b/arch/arm64/configs/openeuler_defconfig @@ -476,6 +476,8 @@ CONFIG_RANDOMIZE_MODULE_REGION_FULL=y CONFIG_ASCEND_FEATURES=y CONFIG_ASCEND_DVPP_MMAP=y CONFIG_ASCEND_CLEAN_CDM=y +CONFIG_ASCEND_CHARGE_MIGRATE_HUGEPAGES=y +CONFIG_ASCEND_SHARE_POOL=y CONFIG_ASCEND_OOM=y # end of Kernel Features
From: Like Xu like.xu@linux.intel.com
mainline inclusion from mainline-v5.12-rc1 commit d855066f81726155caf766e47eea58ae10b1fd57 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4NP0K CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
-------------------
SVM already has specific handlers of MSR_IA32_DEBUGCTLMSR in the svm_get/set_msr, so the x86 common part can be safely moved to VMX. This allows KVM to store the bits it supports in GUEST_IA32_DEBUGCTL.
Add vmx_supported_debugctl() to refactor the throwing logic of #GP.
Signed-off-by: Like Xu like.xu@linux.intel.com Reviewed-by: Andi Kleen ak@linux.intel.com Message-Id: 20210108013704.134985-2-like.xu@linux.intel.com [Merge parts of Chenyi Qiang's "KVM: X86: Expose bus lock debug exception to guest". - Paolo] Signed-off-by: Paolo Bonzini pbonzini@redhat.com Reviewed-by: WangJian wangjian161@huawei.com Reviewed-by: Wei Li liwei391@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- arch/x86/kvm/vmx/capabilities.h | 5 +++++ arch/x86/kvm/vmx/vmx.c | 23 +++++++++++++++++++---- arch/x86/kvm/x86.c | 16 ++-------------- arch/x86/kvm/x86.h | 2 ++ 4 files changed, 28 insertions(+), 18 deletions(-)
diff --git a/arch/x86/kvm/vmx/capabilities.h b/arch/x86/kvm/vmx/capabilities.h index 3a1861403d73..cc44ba8e31c7 100644 --- a/arch/x86/kvm/vmx/capabilities.h +++ b/arch/x86/kvm/vmx/capabilities.h @@ -378,4 +378,9 @@ static inline u64 vmx_get_perf_capabilities(void) return PMU_CAP_FW_WRITES; }
+static inline u64 vmx_supported_debugctl(void) +{ + return 0; +} + #endif /* __KVM_X86_VMX_CAPS_H */ diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index cfd24e019076..7af495b450fd 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -1934,6 +1934,9 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) !guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP)) return 1; goto find_uret_msr; + case MSR_IA32_DEBUGCTLMSR: + msr_info->data = vmcs_read64(GUEST_IA32_DEBUGCTL); + break; default: find_uret_msr: msr = vmx_find_uret_msr(vmx, msr_info->index); @@ -2007,14 +2010,26 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) } vmcs_writel(GUEST_SYSENTER_ESP, data); break; - case MSR_IA32_DEBUGCTLMSR: + case MSR_IA32_DEBUGCTLMSR: { + u64 invalid = data & ~vmx_supported_debugctl(); + if (invalid & (DEBUGCTLMSR_BTF|DEBUGCTLMSR_LBR)) { + if (report_ignored_msrs) + vcpu_unimpl(vcpu, "%s: BTF|LBR in IA32_DEBUGCTLMSR 0x%llx, nop\n", + __func__, data); + data &= ~(DEBUGCTLMSR_BTF|DEBUGCTLMSR_LBR); + invalid &= ~(DEBUGCTLMSR_BTF|DEBUGCTLMSR_LBR); + } + + if (invalid) + return 1; + if (is_guest_mode(vcpu) && get_vmcs12(vcpu)->vm_exit_controls & VM_EXIT_SAVE_DEBUG_CONTROLS) get_vmcs12(vcpu)->guest_ia32_debugctl = data;
- ret = kvm_set_msr_common(vcpu, msr_info); - break; - + vmcs_write64(GUEST_IA32_DEBUGCTL, data); + return 0; + } case MSR_IA32_BNDCFGS: if (!kvm_mpx_supported() || (!msr_info->host_initiated && diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 05f41234687a..9041ffa44baf 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -117,8 +117,9 @@ EXPORT_SYMBOL_GPL(kvm_x86_ops); static bool __read_mostly ignore_msrs = 0; module_param(ignore_msrs, bool, S_IRUGO | S_IWUSR);
-static bool __read_mostly report_ignored_msrs = true; +bool __read_mostly report_ignored_msrs = true; module_param(report_ignored_msrs, bool, S_IRUGO | S_IWUSR); +EXPORT_SYMBOL_GPL(report_ignored_msrs);
unsigned int min_timer_period_us = 200; module_param(min_timer_period_us, uint, S_IRUGO | S_IWUSR); @@ -3164,18 +3165,6 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) return 1; } break; - case MSR_IA32_DEBUGCTLMSR: - if (!data) { - /* We support the non-activated case already */ - break; - } else if (data & ~(DEBUGCTLMSR_LBR | DEBUGCTLMSR_BTF)) { - /* Values other than LBR and BTF are vendor-specific, - thus reserved and should throw a #GP */ - return 1; - } else if (report_ignored_msrs) - vcpu_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTLMSR 0x%llx, nop\n", - __func__, data); - break; case 0x200 ... 0x2ff: return kvm_mtrr_set_msr(vcpu, msr, data); case MSR_IA32_APICBASE: @@ -3452,7 +3441,6 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) switch (msr_info->index) { case MSR_IA32_PLATFORM_ID: case MSR_IA32_EBL_CR_POWERON: - case MSR_IA32_DEBUGCTLMSR: case MSR_IA32_LASTBRANCHFROMIP: case MSR_IA32_LASTBRANCHTOIP: case MSR_IA32_LASTINTFROMIP: diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index 2bff44f1efec..81ea01462175 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -296,6 +296,8 @@ extern int pi_inject_timer;
extern struct static_key kvm_no_apic_vcpu;
+extern bool report_ignored_msrs; + static inline u64 nsec_to_cycles(struct kvm_vcpu *vcpu, u64 nsec) { return pvclock_scale_delta(nsec, vcpu->arch.virtual_tsc_mult,
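As a side note, the acceptance logic this patch adds to vmx_set_msr() can be condensed into the following standalone sketch. The MSR bit definitions are copied from the kernel headers; debugctl_write_check() is a hypothetical name used only for illustration.

#include <stdint.h>

#define DEBUGCTLMSR_LBR (1ULL << 0)
#define DEBUGCTLMSR_BTF (1ULL << 1)

/* Returns 0 to accept 'data' into GUEST_IA32_DEBUGCTL, 1 to inject #GP.
 * 'supported' stands in for vmx_supported_debugctl(), which still
 * returns 0 at this point in the series. */
static int debugctl_write_check(uint64_t *data, uint64_t supported)
{
	uint64_t invalid = *data & ~supported;

	/* BTF/LBR are silently dropped (with an optional vcpu_unimpl()
	 * message) instead of faulting, preserving the old
	 * kvm_set_msr_common() behaviour. */
	if (invalid & (DEBUGCTLMSR_BTF | DEBUGCTLMSR_LBR)) {
		*data &= ~(DEBUGCTLMSR_BTF | DEBUGCTLMSR_LBR);
		invalid &= ~(DEBUGCTLMSR_BTF | DEBUGCTLMSR_LBR);
	}

	return invalid ? 1 : 0;
}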
From: Like Xu like.xu@linux.intel.com
mainline inclusion from mainline-v5.12-rc1 commit 252e365eb28ddf49eb31cec1a5d99e708c73f57b category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4NP0K CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
-------------------
To make code responsibilities clear, we may reuse and invoke vmx_set_intercept_for_msr() in other vmx-specific files (e.g. pmu_intel.c), so expose it in order to pass through the LBR msrs later.
Signed-off-by: Like Xu like.xu@linux.intel.com Reviewed-by: Andi Kleen ak@linux.intel.com Message-Id: 20210201051039.255478-2-like.xu@linux.intel.com Signed-off-by: Paolo Bonzini pbonzini@redhat.com Reviewed-by: WangJian wangjian161@huawei.com Reviewed-by: Wei Li liwei391@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- arch/x86/kvm/vmx/vmx.c | 2 +- arch/x86/kvm/vmx/vmx.h | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-)
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 7af495b450fd..fa43e807e102 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -3801,7 +3801,7 @@ static __always_inline void vmx_enable_intercept_for_msr(struct kvm_vcpu *vcpu, vmx_set_msr_bitmap_write(msr_bitmap, msr); }
-static __always_inline void vmx_set_intercept_for_msr(struct kvm_vcpu *vcpu, +void vmx_set_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type, bool value) { if (value) diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index 5ff24537393e..1b6ca097515f 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -365,6 +365,8 @@ void pt_update_intercept_for_msr(struct kvm_vcpu *vcpu); void vmx_update_host_rsp(struct vcpu_vmx *vmx, unsigned long host_rsp); int vmx_find_loadstore_msr_slot(struct vmx_msrs *m, u32 msr); void vmx_ept_load_pdptrs(struct kvm_vcpu *vcpu); +void vmx_set_intercept_for_msr(struct kvm_vcpu *vcpu, + u32 msr, int type, bool value);
static inline u8 vmx_get_rvi(void) {
From: Paolo Bonzini pbonzini@redhat.com
mainline inclusion from mainline-v5.12-rc1 commit a755753903a40d982f6dd23d65eb96b248a2577a category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4NP0K CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
-------------------
Once MSR_IA32_PERF_CAPABILITIES is changed via vmx_set_msr(), the value should not be changed by cpuid(). To ensure that the new value is kept, the default initialization path is moved to intel_pmu_init(). The effective value of the MSR will be 0 if PDCM is clear, however.
Signed-off-by: Paolo Bonzini pbonzini@redhat.com Reviewed-by: WangJian wangjian161@huawei.com Reviewed-by: Wei Li liwei391@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- arch/x86/kvm/vmx/pmu_intel.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-)
diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c index cdf5f34518f4..7403d46998d6 100644 --- a/arch/x86/kvm/vmx/pmu_intel.c +++ b/arch/x86/kvm/vmx/pmu_intel.c @@ -152,12 +152,17 @@ static struct kvm_pmc *intel_rdpmc_ecx_to_pmc(struct kvm_vcpu *vcpu, return &counters[array_index_nospec(idx, num_counters)]; }
-static inline bool fw_writes_is_enabled(struct kvm_vcpu *vcpu) +static inline u64 vcpu_get_perf_capabilities(struct kvm_vcpu *vcpu) { if (!guest_cpuid_has(vcpu, X86_FEATURE_PDCM)) - return false; + return 0;
- return vcpu->arch.perf_capabilities & PMU_CAP_FW_WRITES; + return vcpu->arch.perf_capabilities; +} + +static inline bool fw_writes_is_enabled(struct kvm_vcpu *vcpu) +{ + return (vcpu_get_perf_capabilities(vcpu) & PMU_CAP_FW_WRITES) != 0; }
static inline struct kvm_pmc *get_fw_gp_pmc(struct kvm_pmu *pmu, u32 msr) @@ -327,7 +332,6 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu) pmu->counter_bitmask[KVM_PMC_FIXED] = 0; pmu->version = 0; pmu->reserved_bits = 0xffffffff00200000ull; - vcpu->arch.perf_capabilities = 0;
entry = kvm_find_cpuid_entry(vcpu, 0xa, 0); if (!entry) @@ -340,8 +344,6 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu) return;
perf_get_x86_pmu_capability(&x86_pmu); - if (guest_cpuid_has(vcpu, X86_FEATURE_PDCM)) - vcpu->arch.perf_capabilities = vmx_get_perf_capabilities();
pmu->nr_arch_gp_counters = min_t(int, eax.split.num_counters, x86_pmu.num_counters_gp); @@ -405,6 +407,8 @@ static void intel_pmu_init(struct kvm_vcpu *vcpu) pmu->fixed_counters[i].idx = i + INTEL_PMC_IDX_FIXED; pmu->fixed_counters[i].current_config = 0; } + + vcpu->arch.perf_capabilities = vmx_get_perf_capabilities(); }
static void intel_pmu_reset(struct kvm_vcpu *vcpu)
From: Paolo Bonzini pbonzini@redhat.com
mainline inclusion from mainline-v5.12-rc1 commit 9c9520ce883386dc3794c7d60204487ff1db09cb category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4NP0K CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
-------------------
Userspace could set bits [0, 5] of the IA32_PERF_CAPABILITIES MSR, which tell the guest about the record format stored in the LBR records.
The LBR will be enabled on the guest if host perf supports LBR (checked via x86_perf_get_lbr()) and the vcpu model is compatible with the host one.
Signed-off-by: Like Xu like.xu@linux.intel.com Message-Id: 20210201051039.255478-4-like.xu@linux.intel.com Signed-off-by: Paolo Bonzini pbonzini@redhat.com Reviewed-by: WangJian wangjian161@huawei.com Reviewed-by: Wei Li liwei391@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- arch/x86/kvm/vmx/capabilities.h | 1 + arch/x86/kvm/vmx/pmu_intel.c | 19 +++++++++++++++++++ arch/x86/kvm/vmx/vmx.c | 12 ++++++++++++ arch/x86/kvm/vmx/vmx.h | 11 +++++++++++ 4 files changed, 43 insertions(+)
diff --git a/arch/x86/kvm/vmx/capabilities.h b/arch/x86/kvm/vmx/capabilities.h index cc44ba8e31c7..db3cec21128e 100644 --- a/arch/x86/kvm/vmx/capabilities.h +++ b/arch/x86/kvm/vmx/capabilities.h @@ -19,6 +19,7 @@ extern int __read_mostly pt_mode; #define PT_MODE_HOST_GUEST 1
#define PMU_CAP_FW_WRITES (1ULL << 13) +#define PMU_CAP_LBR_FMT 0x3f
struct nested_vmx_msrs { /* diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c index 7403d46998d6..d21104e6f9ec 100644 --- a/arch/x86/kvm/vmx/pmu_intel.c +++ b/arch/x86/kvm/vmx/pmu_intel.c @@ -173,6 +173,16 @@ static inline struct kvm_pmc *get_fw_gp_pmc(struct kvm_pmu *pmu, u32 msr) return get_gp_pmc(pmu, msr, MSR_IA32_PMC0); }
+bool intel_pmu_lbr_is_compatible(struct kvm_vcpu *vcpu) +{ + /* + * As a first step, a guest could only enable LBR feature if its + * cpu model is the same as the host because the LBR registers + * would be pass-through to the guest and they're model specific. + */ + return boot_cpu_data.x86_model == guest_cpuid_model(vcpu); +} + static bool intel_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr) { struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); @@ -321,6 +331,8 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) static void intel_pmu_refresh(struct kvm_vcpu *vcpu) { struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); + struct lbr_desc *lbr_desc = vcpu_to_lbr_desc(vcpu); + struct x86_pmu_capability x86_pmu; struct kvm_cpuid_entry2 *entry; union cpuid10_eax eax; @@ -387,12 +399,18 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu) INTEL_PMC_MAX_GENERIC, pmu->nr_arch_fixed_counters);
nested_vmx_pmu_entry_exit_ctls_update(vcpu); + + if (intel_pmu_lbr_is_compatible(vcpu)) + x86_perf_get_lbr(&lbr_desc->records); + else + lbr_desc->records.nr = 0; }
static void intel_pmu_init(struct kvm_vcpu *vcpu) { int i; struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); + struct lbr_desc *lbr_desc = vcpu_to_lbr_desc(vcpu);
for (i = 0; i < INTEL_PMC_MAX_GENERIC; i++) { pmu->gp_counters[i].type = KVM_PMC_GP; @@ -409,6 +427,7 @@ static void intel_pmu_init(struct kvm_vcpu *vcpu) }
vcpu->arch.perf_capabilities = vmx_get_perf_capabilities(); + lbr_desc->records.nr = 0; }
static void intel_pmu_reset(struct kvm_vcpu *vcpu) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index fa43e807e102..abc241395ff7 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -2221,6 +2221,18 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if ((data >> 32) != 0) return 1; goto find_uret_msr; + case MSR_IA32_PERF_CAPABILITIES: + if (data && !vcpu_to_pmu(vcpu)->version) + return 1; + if (data & PMU_CAP_LBR_FMT) { + if ((data & PMU_CAP_LBR_FMT) != + (vmx_get_perf_capabilities() & PMU_CAP_LBR_FMT)) + return 1; + if (!intel_pmu_lbr_is_compatible(vcpu)) + return 1; + } + ret = kvm_set_msr_common(vcpu, msr_info); + break;
default: find_uret_msr: diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index 1b6ca097515f..cb64c2901b10 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -93,6 +93,16 @@ union vmx_exit_reason { u32 full; };
+#define vcpu_to_lbr_desc(vcpu) (&to_vmx(vcpu)->lbr_desc) +#define vcpu_to_lbr_records(vcpu) (&to_vmx(vcpu)->lbr_desc.records) + +bool intel_pmu_lbr_is_compatible(struct kvm_vcpu *vcpu); + +struct lbr_desc { + /* Basic info about guest LBR records. */ + struct x86_pmu_lbr records; +}; + /* * The nested_vmx structure is part of vcpu_vmx, and holds information we need * for correct emulation of VMX (i.e., nested VMX) on this vcpu. @@ -302,6 +312,7 @@ struct vcpu_vmx { u64 ept_pointer;
struct pt_desc pt_desc; + struct lbr_desc lbr_desc;
/* Save desired MSR intercept (read: pass-through) state */ #define MAX_POSSIBLE_PASSTHROUGH_MSRS 13
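The PERF_CAPABILITIES write path added above boils down to three checks; here is a hedged standalone restatement (perf_caps_lbr_write_ok() is a hypothetical helper, not a kernel function).

#include <stdbool.h>
#include <stdint.h>

#define PMU_CAP_LBR_FMT 0x3fULL	/* bits [0,5]: LBR record format */

/* pmu_version:   vcpu_to_pmu(vcpu)->version
 * host_caps:     vmx_get_perf_capabilities()
 * model_matches: intel_pmu_lbr_is_compatible(vcpu) */
static bool perf_caps_lbr_write_ok(uint64_t data, int pmu_version,
				   uint64_t host_caps, bool model_matches)
{
	if (data && !pmu_version)
		return false;	/* non-zero caps without a vPMU: #GP */

	if (data & PMU_CAP_LBR_FMT) {
		/* The guest format must equal the host format exactly... */
		if ((data & PMU_CAP_LBR_FMT) != (host_caps & PMU_CAP_LBR_FMT))
			return false;
		/* ...and the cpu models must match, since the LBR msrs
		 * will be passed through. */
		if (!model_matches)
			return false;
	}
	return true;
}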
From: Like Xu like.xu@linux.intel.com
mainline inclusion from mainline-v5.12-rc1 commit c646236344e9054cc84cd5a9f763163b9654cf7e category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4NP0K CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
-------------------
Userspace could set bits [0, 5] of the IA32_PERF_CAPABILITIES MSR, which tell the guest about the record format stored in the LBR records.
The LBR will be enabled on the guest if host perf supports LBR (checked via x86_perf_get_lbr()) and the vcpu model is compatible with the host one.
Signed-off-by: Like Xu like.xu@linux.intel.com Message-Id: 20210201051039.255478-4-like.xu@linux.intel.com Signed-off-by: Paolo Bonzini pbonzini@redhat.com Reviewed-by: WangJian wangjian161@huawei.com Reviewed-by: Wei Li liwei391@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- arch/x86/kvm/vmx/capabilities.h | 7 ++++++- arch/x86/kvm/vmx/pmu_intel.c | 7 +++++++ arch/x86/kvm/vmx/vmx.c | 12 +++++++++++- arch/x86/kvm/vmx/vmx.h | 1 + 4 files changed, 25 insertions(+), 2 deletions(-)
diff --git a/arch/x86/kvm/vmx/capabilities.h b/arch/x86/kvm/vmx/capabilities.h index db3cec21128e..e836c86dfc15 100644 --- a/arch/x86/kvm/vmx/capabilities.h +++ b/arch/x86/kvm/vmx/capabilities.h @@ -381,7 +381,12 @@ static inline u64 vmx_get_perf_capabilities(void)
static inline u64 vmx_supported_debugctl(void) { - return 0; + u64 debugctl = 0; + + if (vmx_get_perf_capabilities() & PMU_CAP_LBR_FMT) + debugctl |= DEBUGCTLMSR_LBR; + + return debugctl; }
#endif /* __KVM_X86_VMX_CAPS_H */ diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c index d21104e6f9ec..48529dd127bb 100644 --- a/arch/x86/kvm/vmx/pmu_intel.c +++ b/arch/x86/kvm/vmx/pmu_intel.c @@ -183,6 +183,13 @@ bool intel_pmu_lbr_is_compatible(struct kvm_vcpu *vcpu) return boot_cpu_data.x86_model == guest_cpuid_model(vcpu); }
+bool intel_pmu_lbr_is_enabled(struct kvm_vcpu *vcpu) +{ + struct x86_pmu_lbr *lbr = vcpu_to_lbr_records(vcpu); + + return lbr->nr && (vcpu_get_perf_capabilities(vcpu) & PMU_CAP_LBR_FMT); +} + static bool intel_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr) { struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index abc241395ff7..53481ace2c4f 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -1960,6 +1960,16 @@ static u64 nested_vmx_truncate_sysenter_addr(struct kvm_vcpu *vcpu, return (unsigned long)data; }
+static u64 vcpu_supported_debugctl(struct kvm_vcpu *vcpu) +{ + u64 debugctl = vmx_supported_debugctl(); + + if (!intel_pmu_lbr_is_enabled(vcpu)) + debugctl &= ~DEBUGCTLMSR_LBR; + + return debugctl; +} + /* * Writes msr value into the appropriate "register". * Returns 0 on success, non-0 otherwise. @@ -2011,7 +2021,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) vmcs_writel(GUEST_SYSENTER_ESP, data); break; case MSR_IA32_DEBUGCTLMSR: { - u64 invalid = data & ~vmx_supported_debugctl(); + u64 invalid = data & ~vcpu_supported_debugctl(vcpu); if (invalid & (DEBUGCTLMSR_BTF|DEBUGCTLMSR_LBR)) { if (report_ignored_msrs) vcpu_unimpl(vcpu, "%s: BTF|LBR in IA32_DEBUGCTLMSR 0x%llx, nop\n", diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index cb64c2901b10..c7a7234270eb 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -97,6 +97,7 @@ union vmx_exit_reason { #define vcpu_to_lbr_records(vcpu) (&to_vmx(vcpu)->lbr_desc.records)
bool intel_pmu_lbr_is_compatible(struct kvm_vcpu *vcpu); +bool intel_pmu_lbr_is_enabled(struct kvm_vcpu *vcpu);
struct lbr_desc { /* Basic info about guest LBR records. */
From: Like Xu like.xu@linux.intel.com
mainline inclusion from mainline-v5.12-rc1 commit 8e12911b243e485f5e4c7c5fbc79cdf185728700 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4NP0K CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
-------------------
When the vcpu sets DEBUGCTLMSR_LBR in MSR_IA32_DEBUGCTLMSR, the KVM handler would create a guest LBR event which enables the callstack mode and to which no hardware counter is assigned. Host perf would schedule and enable this event as usual, but in an exclusive way.
The guest LBR event will be released when the vPMU is reset; soon after, the lazy release mechanism will be applied to this event like a vPMC.
Suggested-by: Andi Kleen ak@linux.intel.com Co-developed-by: Wei Wang wei.w.wang@intel.com Signed-off-by: Wei Wang wei.w.wang@intel.com Signed-off-by: Like Xu like.xu@linux.intel.com Reviewed-by: Andi Kleen ak@linux.intel.com Message-Id: 20210201051039.255478-6-like.xu@linux.intel.com Signed-off-by: Paolo Bonzini pbonzini@redhat.com Reviewed-by: WangJian wangjian161@huawei.com Reviewed-by: Wei Li liwei391@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- arch/x86/kvm/vmx/pmu_intel.c | 63 ++++++++++++++++++++++++++++++++++++ arch/x86/kvm/vmx/vmx.c | 3 ++ arch/x86/kvm/vmx/vmx.h | 10 ++++++ 3 files changed, 76 insertions(+)
diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c index 48529dd127bb..dbaa175a3c2e 100644 --- a/arch/x86/kvm/vmx/pmu_intel.c +++ b/arch/x86/kvm/vmx/pmu_intel.c @@ -224,6 +224,66 @@ static struct kvm_pmc *intel_msr_idx_to_pmc(struct kvm_vcpu *vcpu, u32 msr) return pmc; }
+static inline void intel_pmu_release_guest_lbr_event(struct kvm_vcpu *vcpu) +{ + struct lbr_desc *lbr_desc = vcpu_to_lbr_desc(vcpu); + + if (lbr_desc->event) { + perf_event_release_kernel(lbr_desc->event); + lbr_desc->event = NULL; + vcpu_to_pmu(vcpu)->event_count--; + } +} + +int intel_pmu_create_guest_lbr_event(struct kvm_vcpu *vcpu) +{ + struct lbr_desc *lbr_desc = vcpu_to_lbr_desc(vcpu); + struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); + struct perf_event *event; + + /* + * The perf_event_attr is constructed in the minimum efficient way: + * - set 'pinned = true' to make it task pinned so that if another + * cpu pinned event reclaims LBR, the event->oncpu will be set to -1; + * - set '.exclude_host = true' to record guest branches behavior; + * + * - set '.config = INTEL_FIXED_VLBR_EVENT' to indicates host perf + * schedule the event without a real HW counter but a fake one; + * check is_guest_lbr_event() and __intel_get_event_constraints(); + * + * - set 'sample_type = PERF_SAMPLE_BRANCH_STACK' and + * 'branch_sample_type = PERF_SAMPLE_BRANCH_CALL_STACK | + * PERF_SAMPLE_BRANCH_USER' to configure it as a LBR callstack + * event, which helps KVM to save/restore guest LBR records + * during host context switches and reduces quite a lot overhead, + * check branch_user_callstack() and intel_pmu_lbr_sched_task(); + */ + struct perf_event_attr attr = { + .type = PERF_TYPE_RAW, + .size = sizeof(attr), + .config = INTEL_FIXED_VLBR_EVENT, + .sample_type = PERF_SAMPLE_BRANCH_STACK, + .pinned = true, + .exclude_host = true, + .branch_sample_type = PERF_SAMPLE_BRANCH_CALL_STACK | + PERF_SAMPLE_BRANCH_USER, + }; + + if (unlikely(lbr_desc->event)) + return 0; + + event = perf_event_create_kernel_counter(&attr, -1, + current, NULL, NULL); + if (IS_ERR(event)) { + pr_debug_ratelimited("%s: failed %ld\n", + __func__, PTR_ERR(event)); + return -ENOENT; + } + lbr_desc->event = event; + pmu->event_count++; + return 0; +} + static int intel_pmu_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) { struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); @@ -435,6 +495,7 @@ static void intel_pmu_init(struct kvm_vcpu *vcpu)
vcpu->arch.perf_capabilities = vmx_get_perf_capabilities(); lbr_desc->records.nr = 0; + lbr_desc->event = NULL; }
static void intel_pmu_reset(struct kvm_vcpu *vcpu) @@ -459,6 +520,8 @@ static void intel_pmu_reset(struct kvm_vcpu *vcpu)
pmu->fixed_ctr_ctrl = pmu->global_ctrl = pmu->global_status = pmu->global_ovf_ctrl = 0; + + intel_pmu_release_guest_lbr_event(vcpu); }
struct kvm_pmu_ops intel_pmu_ops = { diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 53481ace2c4f..1c692d2329c5 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -2038,6 +2038,9 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) get_vmcs12(vcpu)->guest_ia32_debugctl = data;
vmcs_write64(GUEST_IA32_DEBUGCTL, data); + if (intel_pmu_lbr_is_enabled(vcpu) && !to_vmx(vcpu)->lbr_desc.event && + (data & DEBUGCTLMSR_LBR)) + intel_pmu_create_guest_lbr_event(vcpu); return 0; } case MSR_IA32_BNDCFGS: diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index c7a7234270eb..f1905eeaeafb 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -99,9 +99,19 @@ union vmx_exit_reason { bool intel_pmu_lbr_is_compatible(struct kvm_vcpu *vcpu); bool intel_pmu_lbr_is_enabled(struct kvm_vcpu *vcpu);
+int intel_pmu_create_guest_lbr_event(struct kvm_vcpu *vcpu); + struct lbr_desc { /* Basic info about guest LBR records. */ struct x86_pmu_lbr records; + + /* + * Emulate LBR feature via passthrough LBR registers when the + * per-vcpu guest LBR event is scheduled on the current pcpu. + * + * The records may be inaccurate if the host reclaims the LBR. + */ + struct perf_event *event; };
/*
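From inside the guest, nothing more than a DEBUGCTL write is needed to reach intel_pmu_create_guest_lbr_event(); a minimal ring-0 guest-side sketch follows (x86-64 inline asm, runnable only in kernel context; wrmsr64() and guest_enable_lbr_recording() are illustrative names).

#include <stdint.h>

#define MSR_IA32_DEBUGCTLMSR	0x1d9
#define DEBUGCTLMSR_LBR		(1ULL << 0)

static inline void wrmsr64(uint32_t msr, uint64_t val)
{
	asm volatile("wrmsr" : : "c"(msr), "a"((uint32_t)val),
		     "d"((uint32_t)(val >> 32)));
}

/* This single trapped write is what makes the host-side handler above
 * allocate the exclusive, task-pinned vLBR perf event. */
static void guest_enable_lbr_recording(void)
{
	wrmsr64(MSR_IA32_DEBUGCTLMSR, DEBUGCTLMSR_LBR);
}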
From: Like Xu like.xu@linux.intel.com
mainline inclusion from mainline-v5.12-rc1 commit 1b5ac3226a1aa071135fe0ee5d1055d9e88b717c category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4NP0K CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
-------------------
In addition to DEBUGCTLMSR_LBR, any KVM trap caused by an LBR msr access will result in the creation of a per-vcpu guest LBR event.
If the guest LBR event is scheduled on with the corresponding vcpu context, KVM will pass through all LBR record msrs to the guest. The LBR callstack mechanism implemented in the host could help save/restore the guest LBR records during the event context switches, which reduces a lot of overhead compared to saving/restoring tens of LBR msrs (e.g. 32 LBR record entries) in the much more frequent VMX transitions.
To avoid reclaiming LBR resources from any higher priority event on the host, KVM would always check the existence of the guest LBR event and its state before vm-entry, as late as possible. A negative result would cancel the pass-through state, and it also prevents real register accesses and potential data leakage. If the host reclaims the LBR between the two checks, the interception state and LBR records can be safely preserved thanks to the native save/restore support from the guest LBR event.
KVM emits a pr_warn() when the LBR hardware is unavailable to the guest LBR event. The administrator is supposed to remind users that the guest results may be inaccurate if someone is using the LBR to record the hypervisor on the host side.
Suggested-by: Andi Kleen ak@linux.intel.com Co-developed-by: Wei Wang wei.w.wang@intel.com Signed-off-by: Wei Wang wei.w.wang@intel.com Signed-off-by: Like Xu like.xu@linux.intel.com Reviewed-by: Andi Kleen ak@linux.intel.com Message-Id: 20210201051039.255478-7-like.xu@linux.intel.com Signed-off-by: Paolo Bonzini pbonzini@redhat.com Reviewed-by: WangJian wangjian161@huawei.com Reviewed-by: Wei Li liwei391@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- arch/x86/kvm/vmx/pmu_intel.c | 127 ++++++++++++++++++++++++++++++++++- arch/x86/kvm/vmx/vmx.c | 10 +++ arch/x86/kvm/vmx/vmx.h | 1 + 3 files changed, 135 insertions(+), 3 deletions(-)
diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c index dbaa175a3c2e..c7dbaaccbcaa 100644 --- a/arch/x86/kvm/vmx/pmu_intel.c +++ b/arch/x86/kvm/vmx/pmu_intel.c @@ -190,6 +190,24 @@ bool intel_pmu_lbr_is_enabled(struct kvm_vcpu *vcpu) return lbr->nr && (vcpu_get_perf_capabilities(vcpu) & PMU_CAP_LBR_FMT); }
+static bool intel_pmu_is_valid_lbr_msr(struct kvm_vcpu *vcpu, u32 index) +{ + struct x86_pmu_lbr *records = vcpu_to_lbr_records(vcpu); + bool ret = false; + + if (!intel_pmu_lbr_is_enabled(vcpu)) + return ret; + + ret = (index == MSR_LBR_SELECT) || (index == MSR_LBR_TOS) || + (index >= records->from && index < records->from + records->nr) || + (index >= records->to && index < records->to + records->nr); + + if (!ret && records->info) + ret = (index >= records->info && index < records->info + records->nr); + + return ret; +} + static bool intel_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr) { struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); @@ -205,7 +223,8 @@ static bool intel_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr) default: ret = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0) || get_gp_pmc(pmu, msr, MSR_P6_EVNTSEL0) || - get_fixed_pmc(pmu, msr) || get_fw_gp_pmc(pmu, msr); + get_fixed_pmc(pmu, msr) || get_fw_gp_pmc(pmu, msr) || + intel_pmu_is_valid_lbr_msr(vcpu, msr); break; }
@@ -284,6 +303,46 @@ int intel_pmu_create_guest_lbr_event(struct kvm_vcpu *vcpu) return 0; }
+/* + * It's safe to access LBR msrs from guest when they have not + * been passthrough since the host would help restore or reset + * the LBR msrs records when the guest LBR event is scheduled in. + */ +static bool intel_pmu_handle_lbr_msrs_access(struct kvm_vcpu *vcpu, + struct msr_data *msr_info, bool read) +{ + struct lbr_desc *lbr_desc = vcpu_to_lbr_desc(vcpu); + u32 index = msr_info->index; + + if (!intel_pmu_is_valid_lbr_msr(vcpu, index)) + return false; + + if (!lbr_desc->event && !intel_pmu_create_guest_lbr_event(vcpu)) + goto dummy; + + /* + * Disable irq to ensure the LBR feature doesn't get reclaimed by the + * host at the time the value is read from the msr, and this avoids the + * host LBR value to be leaked to the guest. If LBR has been reclaimed, + * return 0 on guest reads. + */ + local_irq_disable(); + if (lbr_desc->event->state == PERF_EVENT_STATE_ACTIVE) { + if (read) + rdmsrl(index, msr_info->data); + else + wrmsrl(index, msr_info->data); + local_irq_enable(); + return true; + } + local_irq_enable(); + +dummy: + if (read) + msr_info->data = 0; + return true; +} + static int intel_pmu_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) { struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); @@ -318,7 +377,8 @@ static int intel_pmu_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) } else if ((pmc = get_gp_pmc(pmu, msr, MSR_P6_EVNTSEL0))) { msr_info->data = pmc->eventsel; return 0; - } + } else if (intel_pmu_handle_lbr_msrs_access(vcpu, msr_info, true)) + return 0; }
return 1; @@ -389,7 +449,8 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) reprogram_gp_counter(pmc, data); return 0; } - } + } else if (intel_pmu_handle_lbr_msrs_access(vcpu, msr_info, false)) + return 0; }
return 1; @@ -524,6 +585,66 @@ static void intel_pmu_reset(struct kvm_vcpu *vcpu) intel_pmu_release_guest_lbr_event(vcpu); }
+static void vmx_update_intercept_for_lbr_msrs(struct kvm_vcpu *vcpu, bool set) +{ + struct x86_pmu_lbr *lbr = vcpu_to_lbr_records(vcpu); + int i; + + for (i = 0; i < lbr->nr; i++) { + vmx_set_intercept_for_msr(vcpu, lbr->from + i, MSR_TYPE_RW, set); + vmx_set_intercept_for_msr(vcpu, lbr->to + i, MSR_TYPE_RW, set); + if (lbr->info) + vmx_set_intercept_for_msr(vcpu, lbr->info + i, MSR_TYPE_RW, set); + } + + vmx_set_intercept_for_msr(vcpu, MSR_LBR_SELECT, MSR_TYPE_RW, set); + vmx_set_intercept_for_msr(vcpu, MSR_LBR_TOS, MSR_TYPE_RW, set); +} + +static inline void vmx_disable_lbr_msrs_passthrough(struct kvm_vcpu *vcpu) +{ + vmx_update_intercept_for_lbr_msrs(vcpu, true); +} + +static inline void vmx_enable_lbr_msrs_passthrough(struct kvm_vcpu *vcpu) +{ + vmx_update_intercept_for_lbr_msrs(vcpu, false); +} + +/* + * Higher priority host perf events (e.g. cpu pinned) could reclaim the + * pmu resources (e.g. LBR) that were assigned to the guest. This is + * usually done via ipi calls (more details in perf_install_in_context). + * + * Before entering the non-root mode (with irq disabled here), double + * confirm that the pmu features enabled to the guest are not reclaimed + * by higher priority host events. Otherwise, disallow vcpu's access to + * the reclaimed features. + */ +void vmx_passthrough_lbr_msrs(struct kvm_vcpu *vcpu) +{ + struct lbr_desc *lbr_desc = vcpu_to_lbr_desc(vcpu); + + if (!lbr_desc->event) { + vmx_disable_lbr_msrs_passthrough(vcpu); + if (vmcs_read64(GUEST_IA32_DEBUGCTL) & DEBUGCTLMSR_LBR) + goto warn; + return; + } + + if (lbr_desc->event->state < PERF_EVENT_STATE_ACTIVE) { + vmx_disable_lbr_msrs_passthrough(vcpu); + goto warn; + } else + vmx_enable_lbr_msrs_passthrough(vcpu); + + return; + +warn: + pr_warn_ratelimited("kvm: vcpu-%d: fail to passthrough LBR.\n", + vcpu->vcpu_id); +} + struct kvm_pmu_ops intel_pmu_ops = { .find_arch_event = intel_find_arch_event, .find_fixed_event = intel_find_fixed_event, diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 1c692d2329c5..88d7e3507156 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -667,6 +667,14 @@ static bool is_valid_passthrough_msr(u32 msr) case MSR_IA32_RTIT_CR3_MATCH: case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B: /* PT MSRs. These are handled in pt_update_intercept_for_msr() */ + case MSR_LBR_SELECT: + case MSR_LBR_TOS: + case MSR_LBR_INFO_0 ... MSR_LBR_INFO_0 + 31: + case MSR_LBR_NHM_FROM ... MSR_LBR_NHM_FROM + 31: + case MSR_LBR_NHM_TO ... MSR_LBR_NHM_TO + 31: + case MSR_LBR_CORE_FROM ... MSR_LBR_CORE_FROM + 8: + case MSR_LBR_CORE_TO ... MSR_LBR_CORE_TO + 8: + /* LBR MSRs. These are handled in vmx_update_intercept_for_lbr_msrs() */ return true; }
@@ -6765,6 +6773,8 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu) pt_guest_enter(vmx);
atomic_switch_perf_msrs(vmx); + if (intel_pmu_lbr_is_enabled(vcpu)) + vmx_passthrough_lbr_msrs(vcpu);
if (enable_preemption_timer) vmx_update_hv_timer(vcpu); diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index f1905eeaeafb..42478237b7ab 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -100,6 +100,7 @@ bool intel_pmu_lbr_is_compatible(struct kvm_vcpu *vcpu); bool intel_pmu_lbr_is_enabled(struct kvm_vcpu *vcpu);
int intel_pmu_create_guest_lbr_event(struct kvm_vcpu *vcpu); +void vmx_passthrough_lbr_msrs(struct kvm_vcpu *vcpu);
struct lbr_desc { /* Basic info about guest LBR records. */
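The vm-entry decision implemented by vmx_passthrough_lbr_msrs() above can be summarised as a small pure function. This is only a restatement of the control flow in the hunk, with hypothetical names.

#include <stdbool.h>

enum lbr_action {
	LBR_INTERCEPT,		/* keep/restore interception, no warning */
	LBR_INTERCEPT_WARN,	/* intercept and pr_warn_ratelimited()   */
	LBR_PASSTHROUGH,	/* leave the LBR msrs passed through     */
};

/* event_exists: lbr_desc->event != NULL
 * event_active: event->state == PERF_EVENT_STATE_ACTIVE
 * debugctl_lbr: GUEST_IA32_DEBUGCTL.LBR is set */
static enum lbr_action lbr_entry_decision(bool event_exists,
					  bool event_active,
					  bool debugctl_lbr)
{
	if (!event_exists)
		return debugctl_lbr ? LBR_INTERCEPT_WARN : LBR_INTERCEPT;
	if (!event_active)
		return LBR_INTERCEPT_WARN;	/* host reclaimed the LBR */
	return LBR_PASSTHROUGH;
}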
From: Like Xu like.xu@linux.intel.com
mainline inclusion from mainline-v5.12-rc1 commit 9254beaafd12e27d48149fab3b16db372bc90ad7 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4NP0K CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
-------------------
When the LBR record msrs have already been passed through, there is no need to call vmx_update_intercept_for_lbr_msrs() again and again, and vice versa.
Signed-off-by: Like Xu like.xu@linux.intel.com Reviewed-by: Andi Kleen ak@linux.intel.com Message-Id: 20210201051039.255478-8-like.xu@linux.intel.com Signed-off-by: Paolo Bonzini pbonzini@redhat.com Reviewed-by: WangJian wangjian161@huawei.com Reviewed-by: Wei Li liwei391@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- arch/x86/kvm/vmx/pmu_intel.c | 13 +++++++++++++ arch/x86/kvm/vmx/vmx.h | 3 +++ 2 files changed, 16 insertions(+)
diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c index c7dbaaccbcaa..254d9fc09863 100644 --- a/arch/x86/kvm/vmx/pmu_intel.c +++ b/arch/x86/kvm/vmx/pmu_intel.c @@ -557,6 +557,7 @@ static void intel_pmu_init(struct kvm_vcpu *vcpu) vcpu->arch.perf_capabilities = vmx_get_perf_capabilities(); lbr_desc->records.nr = 0; lbr_desc->event = NULL; + lbr_desc->msr_passthrough = false; }
static void intel_pmu_reset(struct kvm_vcpu *vcpu) @@ -603,12 +604,24 @@ static void vmx_update_intercept_for_lbr_msrs(struct kvm_vcpu *vcpu, bool set)
static inline void vmx_disable_lbr_msrs_passthrough(struct kvm_vcpu *vcpu) { + struct lbr_desc *lbr_desc = vcpu_to_lbr_desc(vcpu); + + if (!lbr_desc->msr_passthrough) + return; + vmx_update_intercept_for_lbr_msrs(vcpu, true); + lbr_desc->msr_passthrough = false; }
static inline void vmx_enable_lbr_msrs_passthrough(struct kvm_vcpu *vcpu) { + struct lbr_desc *lbr_desc = vcpu_to_lbr_desc(vcpu); + + if (lbr_desc->msr_passthrough) + return; + vmx_update_intercept_for_lbr_msrs(vcpu, false); + lbr_desc->msr_passthrough = true; }
/* diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index 42478237b7ab..c0b52498e4bb 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -113,6 +113,9 @@ struct lbr_desc { * The records may be inaccurate if the host reclaims the LBR. */ struct perf_event *event; + + /* True if LBRs are marked as not intercepted in the MSR bitmap */ + bool msr_passthrough; };
/*
From: Like Xu like.xu@linux.intel.com
mainline inclusion from mainline-v5.12-rc1 commit e6209a3bef793e8fe29c873a7612023916eaa611 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4NP0K CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
-------------------
The current vPMU only supports Architecture Version 2. According to Intel SDM "17.4.7 Freezing LBR and Performance Counters on PMI", if IA32_DEBUGCTL.Freeze_LBR_On_PMI = 1, the LBR is frozen on the virtual PMI and KVM emulates this by clearing the LBR bit (bit 0) in IA32_DEBUGCTL. The guest then needs to re-enable IA32_DEBUGCTL.LBR to resume recording branches.
Signed-off-by: Like Xu like.xu@linux.intel.com Reviewed-by: Andi Kleen ak@linux.intel.com Message-Id: 20210201051039.255478-9-like.xu@linux.intel.com Signed-off-by: Paolo Bonzini pbonzini@redhat.com Reviewed-by: WangJian wangjian161@huawei.com Reviewed-by: Wei Li liwei391@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- arch/x86/kvm/pmu.c | 5 ++++- arch/x86/kvm/pmu.h | 1 + arch/x86/kvm/vmx/capabilities.h | 4 +++- arch/x86/kvm/vmx/pmu_intel.c | 30 ++++++++++++++++++++++++++++++ arch/x86/kvm/vmx/vmx.c | 2 +- 5 files changed, 39 insertions(+), 3 deletions(-)
diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index 67741d2a0308..405890c723a1 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -383,8 +383,11 @@ int kvm_pmu_rdpmc(struct kvm_vcpu *vcpu, unsigned idx, u64 *data)
void kvm_pmu_deliver_pmi(struct kvm_vcpu *vcpu) { - if (lapic_in_kernel(vcpu)) + if (lapic_in_kernel(vcpu)) { + if (kvm_x86_ops.pmu_ops->deliver_pmi) + kvm_x86_ops.pmu_ops->deliver_pmi(vcpu); kvm_apic_local_deliver(vcpu->arch.apic, APIC_LVTPC); + } }
bool kvm_pmu_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr) diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h index 067fef51760c..742a4e98df8c 100644 --- a/arch/x86/kvm/pmu.h +++ b/arch/x86/kvm/pmu.h @@ -39,6 +39,7 @@ struct kvm_pmu_ops { void (*refresh)(struct kvm_vcpu *vcpu); void (*init)(struct kvm_vcpu *vcpu); void (*reset)(struct kvm_vcpu *vcpu); + void (*deliver_pmi)(struct kvm_vcpu *vcpu); };
static inline u64 pmc_bitmask(struct kvm_pmc *pmc) diff --git a/arch/x86/kvm/vmx/capabilities.h b/arch/x86/kvm/vmx/capabilities.h index e836c86dfc15..aba629ed452f 100644 --- a/arch/x86/kvm/vmx/capabilities.h +++ b/arch/x86/kvm/vmx/capabilities.h @@ -21,6 +21,8 @@ extern int __read_mostly pt_mode; #define PMU_CAP_FW_WRITES (1ULL << 13) #define PMU_CAP_LBR_FMT 0x3f
+#define DEBUGCTLMSR_LBR_MASK (DEBUGCTLMSR_LBR | DEBUGCTLMSR_FREEZE_LBRS_ON_PMI) + struct nested_vmx_msrs { /* * We only store the "true" versions of the VMX capability MSRs. We @@ -384,7 +386,7 @@ static inline u64 vmx_supported_debugctl(void) u64 debugctl = 0;
if (vmx_get_perf_capabilities() & PMU_CAP_LBR_FMT) - debugctl |= DEBUGCTLMSR_LBR; + debugctl |= DEBUGCTLMSR_LBR_MASK;
return debugctl; } diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c index 254d9fc09863..7928d6dee0a2 100644 --- a/arch/x86/kvm/vmx/pmu_intel.c +++ b/arch/x86/kvm/vmx/pmu_intel.c @@ -586,6 +586,35 @@ static void intel_pmu_reset(struct kvm_vcpu *vcpu) intel_pmu_release_guest_lbr_event(vcpu); }
+/* + * Emulate LBR_On_PMI behavior for 1 < pmu.version < 4. + * + * If Freeze_LBR_On_PMI = 1, the LBR is frozen on PMI and + * the KVM emulates to clear the LBR bit (bit 0) in IA32_DEBUGCTL. + * + * Guest needs to re-enable LBR to resume branches recording. + */ +static void intel_pmu_legacy_freezing_lbrs_on_pmi(struct kvm_vcpu *vcpu) +{ + u64 data = vmcs_read64(GUEST_IA32_DEBUGCTL); + + if (data & DEBUGCTLMSR_FREEZE_LBRS_ON_PMI) { + data &= ~DEBUGCTLMSR_LBR; + vmcs_write64(GUEST_IA32_DEBUGCTL, data); + } +} + +static void intel_pmu_deliver_pmi(struct kvm_vcpu *vcpu) +{ + u8 version = vcpu_to_pmu(vcpu)->version; + + if (!intel_pmu_lbr_is_enabled(vcpu)) + return; + + if (version > 1 && version < 4) + intel_pmu_legacy_freezing_lbrs_on_pmi(vcpu); +} + static void vmx_update_intercept_for_lbr_msrs(struct kvm_vcpu *vcpu, bool set) { struct x86_pmu_lbr *lbr = vcpu_to_lbr_records(vcpu); @@ -672,4 +701,5 @@ struct kvm_pmu_ops intel_pmu_ops = { .refresh = intel_pmu_refresh, .init = intel_pmu_init, .reset = intel_pmu_reset, + .deliver_pmi = intel_pmu_deliver_pmi, }; diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 88d7e3507156..e84fddd9f4c0 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -1973,7 +1973,7 @@ static u64 vcpu_supported_debugctl(struct kvm_vcpu *vcpu) u64 debugctl = vmx_supported_debugctl();
if (!intel_pmu_lbr_is_enabled(vcpu)) - debugctl &= ~DEBUGCTLMSR_LBR; + debugctl &= ~DEBUGCTLMSR_LBR_MASK;
return debugctl; }
From: Like Xu like.xu@linux.intel.com
mainline inclusion from mainline-v5.12-rc1 commit 9aa4f622460f9287e57804dbeb219bfef29f04a1 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4NP0K CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
-------------------
The vPMU uses GUEST_LBR_IN_USE_IDX (bit 58) in 'pmu->pmc_in_use' to indicate whether a guest LBR event is still needed by the vcpu. If the vcpu no longer accesses LBR-related registers within a scheduling time slice, and the LBR enable bit has been unset, the vPMU will treat the guest LBR event as an ordinary event of a vPMC counter and release it as usual. Also, the pass-through state of the LBR record msrs is cancelled.
Signed-off-by: Like Xu like.xu@linux.intel.com Message-Id: 20210201051039.255478-10-like.xu@linux.intel.com Signed-off-by: Paolo Bonzini pbonzini@redhat.com Reviewed-by: WangJian wangjian161@huawei.com Reviewed-by: Wei Li liwei391@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- arch/x86/kvm/pmu.c | 3 +++ arch/x86/kvm/pmu.h | 1 + arch/x86/kvm/vmx/pmu_intel.c | 21 ++++++++++++++++++++- 3 files changed, 24 insertions(+), 1 deletion(-)
diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index 405890c723a1..136dc2f3c5d3 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -476,6 +476,9 @@ void kvm_pmu_cleanup(struct kvm_vcpu *vcpu) pmc_stop_counter(pmc); }
+ if (kvm_x86_ops.pmu_ops->cleanup) + kvm_x86_ops.pmu_ops->cleanup(vcpu); + bitmap_zero(pmu->pmc_in_use, X86_PMC_IDX_MAX); }
diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h index 742a4e98df8c..7b30bc967af3 100644 --- a/arch/x86/kvm/pmu.h +++ b/arch/x86/kvm/pmu.h @@ -40,6 +40,7 @@ struct kvm_pmu_ops { void (*init)(struct kvm_vcpu *vcpu); void (*reset)(struct kvm_vcpu *vcpu); void (*deliver_pmi)(struct kvm_vcpu *vcpu); + void (*cleanup)(struct kvm_vcpu *vcpu); };
static inline u64 pmc_bitmask(struct kvm_pmc *pmc) diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c index 7928d6dee0a2..d1df618cb7de 100644 --- a/arch/x86/kvm/vmx/pmu_intel.c +++ b/arch/x86/kvm/vmx/pmu_intel.c @@ -288,8 +288,10 @@ int intel_pmu_create_guest_lbr_event(struct kvm_vcpu *vcpu) PERF_SAMPLE_BRANCH_USER, };
- if (unlikely(lbr_desc->event)) + if (unlikely(lbr_desc->event)) { + __set_bit(INTEL_PMC_IDX_FIXED_VLBR, pmu->pmc_in_use); return 0; + }
event = perf_event_create_kernel_counter(&attr, -1, current, NULL, NULL); @@ -300,6 +302,7 @@ int intel_pmu_create_guest_lbr_event(struct kvm_vcpu *vcpu) } lbr_desc->event = event; pmu->event_count++; + __set_bit(INTEL_PMC_IDX_FIXED_VLBR, pmu->pmc_in_use); return 0; }
@@ -332,9 +335,11 @@ static bool intel_pmu_handle_lbr_msrs_access(struct kvm_vcpu *vcpu, rdmsrl(index, msr_info->data); else wrmsrl(index, msr_info->data); + __set_bit(INTEL_PMC_IDX_FIXED_VLBR, vcpu_to_pmu(vcpu)->pmc_in_use); local_irq_enable(); return true; } + clear_bit(INTEL_PMC_IDX_FIXED_VLBR, vcpu_to_pmu(vcpu)->pmc_in_use); local_irq_enable();
dummy: @@ -532,6 +537,9 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu) x86_perf_get_lbr(&lbr_desc->records); else lbr_desc->records.nr = 0; + + if (lbr_desc->records.nr) + bitmap_set(pmu->all_valid_pmc_idx, INTEL_PMC_IDX_FIXED_VLBR, 1); }
static void intel_pmu_init(struct kvm_vcpu *vcpu) @@ -665,17 +673,21 @@ static inline void vmx_enable_lbr_msrs_passthrough(struct kvm_vcpu *vcpu) */ void vmx_passthrough_lbr_msrs(struct kvm_vcpu *vcpu) { + struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); struct lbr_desc *lbr_desc = vcpu_to_lbr_desc(vcpu);
if (!lbr_desc->event) { vmx_disable_lbr_msrs_passthrough(vcpu); if (vmcs_read64(GUEST_IA32_DEBUGCTL) & DEBUGCTLMSR_LBR) goto warn; + if (test_bit(INTEL_PMC_IDX_FIXED_VLBR, pmu->pmc_in_use)) + goto warn; return; }
if (lbr_desc->event->state < PERF_EVENT_STATE_ACTIVE) { vmx_disable_lbr_msrs_passthrough(vcpu); + __clear_bit(INTEL_PMC_IDX_FIXED_VLBR, pmu->pmc_in_use); goto warn; } else vmx_enable_lbr_msrs_passthrough(vcpu); @@ -687,6 +699,12 @@ void vmx_passthrough_lbr_msrs(struct kvm_vcpu *vcpu) vcpu->vcpu_id); }
+static void intel_pmu_cleanup(struct kvm_vcpu *vcpu) +{ + if (!(vmcs_read64(GUEST_IA32_DEBUGCTL) & DEBUGCTLMSR_LBR)) + intel_pmu_release_guest_lbr_event(vcpu); +} + struct kvm_pmu_ops intel_pmu_ops = { .find_arch_event = intel_find_arch_event, .find_fixed_event = intel_find_fixed_event, @@ -702,4 +720,5 @@ struct kvm_pmu_ops intel_pmu_ops = { .init = intel_pmu_init, .reset = intel_pmu_reset, .deliver_pmi = intel_pmu_deliver_pmi, + .cleanup = intel_pmu_cleanup, };
From: Like Xu like.xu@linux.intel.com
mainline inclusion from mainline-v5.12-rc1 commit be635e34c284d08b1da7f93ddd6a2110617d15e7 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4NP0K CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
-------------------
Userspace could enable the guest LBR feature when the exact supported LBR format value is written to MSR_IA32_PERF_CAPABILITIES and the LBR is also compatible with the vPMU version and the host cpu model.
The LBR could be enabled on the guest if host perf supports LBR (checked via x86_perf_get_lbr()) and the vcpu model is compatible with the host one.
Signed-off-by: Like Xu like.xu@linux.intel.com Message-Id: 20210201051039.255478-11-like.xu@linux.intel.com Signed-off-by: Paolo Bonzini pbonzini@redhat.com Reviewed-by: WangJian wangjian161@huawei.com Reviewed-by: Wei Li liwei391@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- arch/x86/kvm/vmx/capabilities.h | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-)
diff --git a/arch/x86/kvm/vmx/capabilities.h b/arch/x86/kvm/vmx/capabilities.h index aba629ed452f..7c47c7f7911b 100644 --- a/arch/x86/kvm/vmx/capabilities.h +++ b/arch/x86/kvm/vmx/capabilities.h @@ -374,11 +374,18 @@ static inline bool vmx_pt_mode_is_host_guest(void)
static inline u64 vmx_get_perf_capabilities(void) { + u64 perf_cap = 0; + + if (boot_cpu_has(X86_FEATURE_PDCM)) + rdmsrl(MSR_IA32_PERF_CAPABILITIES, perf_cap); + + perf_cap &= PMU_CAP_LBR_FMT; + /* * Since counters are virtualized, KVM would support full * width counting unconditionally, even if the host lacks it. */ - return PMU_CAP_FW_WRITES; + return PMU_CAP_FW_WRITES | perf_cap; }
static inline u64 vmx_supported_debugctl(void)
From: Like Xu like.xu@linux.intel.com
mainline inclusion from mainline-v5.12-rc1 commit 67b45af946ec3148b64e6a3a1ee2ea8f79c5bc07 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4NP0K CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
-------------------
If lbr_desc->event is successfully created, intel_pmu_create_guest_lbr_event() will return 0; otherwise it will return -ENOENT, and the caller then jumps to the LBR msrs dummy handling.
Fixes: 1b5ac3226a1a ("KVM: vmx/pmu: Pass-through LBR msrs when the guest LBR event is ACTIVE") Signed-off-by: Like Xu like.xu@linux.intel.com Message-Id: 20210223013958.1280444-1-like.xu@linux.intel.com [Add "< 0" and PTR_ERR to make the code clearer. - Paolo] Signed-off-by: Paolo Bonzini pbonzini@redhat.com Reviewed-by: WangJian wangjian161@huawei.com Reviewed-by: Wei Li liwei391@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- arch/x86/kvm/vmx/pmu_intel.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c index d1df618cb7de..9efc1a6b8693 100644 --- a/arch/x86/kvm/vmx/pmu_intel.c +++ b/arch/x86/kvm/vmx/pmu_intel.c @@ -298,7 +298,7 @@ int intel_pmu_create_guest_lbr_event(struct kvm_vcpu *vcpu) if (IS_ERR(event)) { pr_debug_ratelimited("%s: failed %ld\n", __func__, PTR_ERR(event)); - return -ENOENT; + return PTR_ERR(event); } lbr_desc->event = event; pmu->event_count++; @@ -320,7 +320,7 @@ static bool intel_pmu_handle_lbr_msrs_access(struct kvm_vcpu *vcpu, if (!intel_pmu_is_valid_lbr_msr(vcpu, index)) return false;
- if (!lbr_desc->event && !intel_pmu_create_guest_lbr_event(vcpu)) + if (!lbr_desc->event && intel_pmu_create_guest_lbr_event(vcpu) < 0) goto dummy;
/*
From: Yanling Song songyl@ramaxel.com
Ramaxel inclusion category: features bugzilla: https://gitee.com/openeuler/kernel/issues/I4JXCG CVE: NA
Changes: 1. Use the bsg module to replace the private ioctl interface (see the usage sketch below). 2. Split scmd_tmout_nonpt into two parameters: scmd_tmout_vd/scmd_tmout_rawdisk. 3. Return -ETIME instead of -EINVAL when a command times out. 4. Add one module parameter: max_io_force. 5. Remove some unnecessary module parameters. 6. Report disks in the order of channel/target id. 7. Add a host_reset handler. 8. Use get_unaligned_be24.
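Since point 1 moves management commands onto a bsg queue, they now travel as sg_io_v4 requests. Below is a hypothetical userspace sketch of the new path; the /dev/bsg node name and the trimmed request layout are assumptions, and only the msgcode values come from the header in this patch.

#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/bsg.h>	/* struct sg_io_v4, BSG_PROTOCOL_* */
#include <scsi/sg.h>	/* SG_IO */

/* Trimmed stand-in for struct spraid_bsg_request from spraid.h;
 * the real union carries the full admin/ioq passthru commands. */
struct spraid_bsg_request {
	uint32_t msgcode;	/* SPRAID_BSG_ADM = 0, SPRAID_BSG_IOQ = 1 */
	uint32_t control;
	unsigned char cmd[152];	/* placeholder for the command union */
};

int main(void)
{
	struct spraid_bsg_request req = { .msgcode = 0 /* SPRAID_BSG_ADM */ };
	struct sg_io_v4 hdr;
	int fd = open("/dev/bsg/spraid0", O_RDWR);	/* node name assumed */

	if (fd < 0) {
		perror("open");
		return 1;
	}

	memset(&hdr, 0, sizeof(hdr));
	hdr.guard = 'Q';
	hdr.protocol = BSG_PROTOCOL_SCSI;
	hdr.subprotocol = BSG_SUB_PROTOCOL_SCSI_TRANSPORT;
	hdr.request = (uintptr_t)&req;
	hdr.request_len = sizeof(req);
	hdr.timeout = 60 * 1000;	/* admin_tmout default, in ms */

	if (ioctl(fd, SG_IO, &hdr) < 0)
		perror("SG_IO");

	close(fd);
	return 0;
}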
Signed-off-by: Yanling Song songyl@ramaxel.com Reviewed-by: Jiang Yu yujiang@ramaxel.com Acked-by: Xie XiuQi xiexiuqi@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- drivers/scsi/spraid/Kconfig | 6 +- drivers/scsi/spraid/spraid.h | 142 ++- drivers/scsi/spraid/spraid_main.c | 1633 +++++++++++++++-------------- 3 files changed, 968 insertions(+), 813 deletions(-)
diff --git a/drivers/scsi/spraid/Kconfig b/drivers/scsi/spraid/Kconfig index 83962efaab07..bfbba3db8db0 100644 --- a/drivers/scsi/spraid/Kconfig +++ b/drivers/scsi/spraid/Kconfig @@ -5,7 +5,9 @@ config RAMAXEL_SPRAID tristate "Ramaxel spraid Adapter" depends on PCI && SCSI + select BLK_DEV_BSGLIB depends on ARM64 || X86_64 - default m help - This driver supports Ramaxel spraid driver. + This driver supports the Ramaxel SPRxxx series + RAID controller, which has a PCIe Gen4 host + interface and supports SAS/SATA HDDs/SSDs. diff --git a/drivers/scsi/spraid/spraid.h b/drivers/scsi/spraid/spraid.h index da46d8e1b4b6..983d7af2faa8 100644 --- a/drivers/scsi/spraid/spraid.h +++ b/drivers/scsi/spraid/spraid.h @@ -1,4 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright(c) 2021 Ramaxel Memory Technology, Ltd */
#ifndef __SPRAID_H_ #define __SPRAID_H_ @@ -24,7 +25,7 @@ #define SENSE_SIZE(depth) ((depth) * SCSI_SENSE_BUFFERSIZE)
#define SPRAID_AQ_DEPTH 128 -#define SPRAID_NR_AEN_COMMANDS 1 +#define SPRAID_NR_AEN_COMMANDS 16 #define SPRAID_AQ_BLK_MQ_DEPTH (SPRAID_AQ_DEPTH - SPRAID_NR_AEN_COMMANDS) #define SPRAID_AQ_MQ_TAG_DEPTH (SPRAID_AQ_BLK_MQ_DEPTH - 1)
@@ -44,7 +45,7 @@
#define SMALL_POOL_SIZE 256 #define MAX_SMALL_POOL_NUM 16 -#define MAX_CMD_PER_DEV 32 +#define MAX_CMD_PER_DEV 64 #define MAX_CDB_LEN 32
#define SPRAID_UP_TO_MULTY4(x) (((x) + 4) & (~0x03)) @@ -53,7 +54,7 @@
#define PCI_VENDOR_ID_RAMAXEL_LOGIC 0x1E81
-#define SPRAID_SERVER_DEVICE_HAB_DID 0x2100 +#define SPRAID_SERVER_DEVICE_HBA_DID 0x2100 #define SPRAID_SERVER_DEVICE_RAID_DID 0x2200
#define IO_6_DEFAULT_TX_LEN 256 @@ -142,11 +143,15 @@ enum {
enum { SPRAID_AEN_DEV_CHANGED = 0x00, + SPRAID_AEN_FW_ACT_START = 0x01, SPRAID_AEN_HOST_PROBING = 0x10, };
enum { - SPRAID_AEN_TIMESYN = 0x07 + SPRAID_AEN_TIMESYN = 0x00, + SPRAID_AEN_FW_ACT_FINISH = 0x02, + SPRAID_AEN_EVENT_MIN = 0x80, + SPRAID_AEN_EVENT_MAX = 0xff, };
enum { @@ -175,6 +180,16 @@ enum spraid_state { SPRAID_DEAD, };
+enum { + SPRAID_CARD_HBA, + SPRAID_CARD_RAID, +}; + +enum spraid_cmd_type { + SPRAID_CMD_ADM, + SPRAID_CMD_IOPT, +}; + struct spraid_completion { __le32 result; union { @@ -217,8 +232,6 @@ struct spraid_dev { struct dma_pool *prp_page_pool; struct dma_pool *prp_small_pool[MAX_SMALL_POOL_NUM]; mempool_t *iod_mempool; - struct blk_mq_tag_set admin_tagset; - struct request_queue *admin_q; void __iomem *bar; u32 max_qid; u32 num_vecs; @@ -232,23 +245,27 @@ struct spraid_dev { u32 ctrl_config; u32 online_queues; u64 cap; - struct device ctrl_device; - struct cdev cdev; int instance; struct spraid_ctrl_info *ctrl_info; struct spraid_dev_info *devices;
- struct spraid_ioq_ptcmd *ioq_ptcmds; + struct spraid_cmd *adm_cmds; + struct list_head adm_cmd_list; + spinlock_t adm_cmd_lock; + + struct spraid_cmd *ioq_ptcmds; struct list_head ioq_pt_list; spinlock_t ioq_pt_lock;
- struct work_struct aen_work; struct work_struct scan_work; struct work_struct timesyn_work; struct work_struct reset_work; + struct work_struct fw_act_work;
enum spraid_state state; spinlock_t state_lock; + + struct request_queue *bsg_queue; };
struct spraid_sgl_desc { @@ -347,6 +364,35 @@ struct spraid_get_info { __u32 rsvd12[4]; };
+struct spraid_usr_cmd { + __u8 opcode; + __u8 flags; + __u16 command_id; + __le32 hdid; + union { + struct { + __le16 subopcode; + __le16 rsvd1; + } info_0; + __le32 cdw2; + }; + union { + struct { + __le16 data_len; + __le16 param_len; + } info_1; + __le32 cdw3; + }; + __u64 metadata; + union spraid_data_ptr dptr; + __le32 cdw10; + __le32 cdw11; + __le32 cdw12; + __le32 cdw13; + __le32 cdw14; + __le32 cdw15; +}; + enum { SPRAID_CMD_FLAG_SGL_METABUF = (1 << 6), SPRAID_CMD_FLAG_SGL_METASEG = (1 << 7), @@ -393,6 +439,7 @@ struct spraid_admin_command { struct spraid_get_info get_info; struct spraid_abort_cmd abort; struct spraid_reset_cmd reset; + struct spraid_usr_cmd usr_cmd; }; };
@@ -456,9 +503,6 @@ struct spraid_ioq_command { }; };
-#define SPRAID_IOCTL_RESET_CMD _IOWR('N', 0x80, struct spraid_passthru_common_cmd) -#define SPRAID_IOCTL_ADMIN_CMD _IOWR('N', 0x41, struct spraid_passthru_common_cmd) - struct spraid_passthru_common_cmd { __u8 opcode; __u8 flags; @@ -494,8 +538,6 @@ struct spraid_passthru_common_cmd { __u32 result1; };
-#define SPRAID_IOCTL_IOQ_CMD _IOWR('N', 0x42, struct spraid_ioq_passthru_cmd) - struct spraid_ioq_passthru_cmd { __u8 opcode; __u8 flags; @@ -560,7 +602,21 @@ struct spraid_ioq_passthru_cmd { __u32 result1; };
-struct spraid_ioq_ptcmd { +struct spraid_bsg_request { + u32 msgcode; + u32 control; + union { + struct spraid_passthru_common_cmd admcmd; + struct spraid_ioq_passthru_cmd ioqcmd; + }; +}; + +enum { + SPRAID_BSG_ADM, + SPRAID_BSG_IOQ, +}; + +struct spraid_cmd { int qid; int cid; u32 result0; @@ -572,14 +628,6 @@ struct spraid_ioq_ptcmd { struct list_head list; };
-struct spraid_admin_request { - struct spraid_admin_command *cmd; - u32 result0; - u32 result1; - u16 flags; - u16 status; -}; - struct spraid_queue { struct spraid_dev *hdev; spinlock_t sq_lock; /* spinlock for lock handling */ @@ -607,7 +655,6 @@ struct spraid_queue { };
struct spraid_iod { - struct spraid_admin_request req; struct spraid_queue *spraidq; enum spraid_cmd_state state; int npages; @@ -623,13 +670,51 @@ struct spraid_iod { };
#define SPRAID_DEV_INFO_ATTR_BOOT(attr) ((attr) & 0x01) -#define SPRAID_DEV_INFO_ATTR_HDD(attr) ((attr) & 0x02) +#define SPRAID_DEV_INFO_ATTR_VD(attr) (((attr) & 0x02) == 0x0) #define SPRAID_DEV_INFO_ATTR_PT(attr) (((attr) & 0x22) == 0x02) #define SPRAID_DEV_INFO_ATTR_RAWDISK(attr) ((attr) & 0x20)
#define SPRAID_DEV_INFO_FLAG_VALID(flag) ((flag) & 0x01) #define SPRAID_DEV_INFO_FLAG_CHANGE(flag) ((flag) & 0x02)
+#define BGTASK_TYPE_REBUILD 4 +#define USR_CMD_READ 0xc2 +#define USR_CMD_RDLEN 0x1000 +#define USR_CMD_VDINFO 0x704 +#define USR_CMD_BGTASK 0x504 +#define VDINFO_PARAM_LEN 0x04 + +struct spraid_vd_info { + __u8 name[32]; + __le16 id; + __u8 rg_id; + __u8 rg_level; + __u8 sg_num; + __u8 sg_disk_num; + __u8 vd_status; + __u8 vd_type; + __u8 rsvd1[4056]; +}; + +#define MAX_REALTIME_BGTASK_NUM 32 + +struct bgtask_info { + __u8 type; + __u8 progress; + __u8 rate; + __u8 rsvd0; + __le16 vd_id; + __le16 time_left; + __u8 rsvd1[4]; +}; + +struct spraid_bgtask { + __u8 sw; + __u8 task_num; + __u8 rsvd[6]; + struct bgtask_info bgtask[MAX_REALTIME_BGTASK_NUM]; +}; + struct spraid_dev_info { __le32 hdid; __le16 target; @@ -649,6 +734,11 @@ struct spraid_dev_list {
struct spraid_sdev_hostdata { u32 hdid; + u16 max_io_kb; + u8 attr; + u8 flag; + u8 rg_id; + u8 rsvd[3]; };
#endif diff --git a/drivers/scsi/spraid/spraid_main.c b/drivers/scsi/spraid/spraid_main.c index a0a75ecb0027..c6d2a0b8e35e 100644 --- a/drivers/scsi/spraid/spraid_main.c +++ b/drivers/scsi/spraid/spraid_main.c @@ -1,8 +1,8 @@ // SPDX-License-Identifier: GPL-2.0 -/* - * Linux spraid device driver - * Copyright(c) 2021 Ramaxel Memory Technology, Ltd - */ +/* Copyright(c) 2021 Ramaxel Memory Technology, Ltd */ + +/* Ramaxel Raid SPXXX Series Linux Driver */ + #define pr_fmt(fmt) "spraid: " fmt
#include <linux/sched/signal.h> @@ -23,6 +23,9 @@ #include <linux/debugfs.h> #include <linux/io-64-nonatomic-lo-hi.h> #include <linux/blkdev.h> +#include <linux/bsg-lib.h> +#include <asm/unaligned.h> +#include <linux/sort.h>
#include <scsi/scsi.h> #include <scsi/scsi_cmnd.h> @@ -31,27 +34,24 @@ #include <scsi/scsi_transport.h> #include <scsi/scsi_dbg.h>
+ #include "spraid.h"
static u32 admin_tmout = 60; module_param(admin_tmout, uint, 0644); MODULE_PARM_DESC(admin_tmout, "admin commands timeout (seconds)");
-static u32 scmd_tmout_pt = 30; -module_param(scmd_tmout_pt, uint, 0644); -MODULE_PARM_DESC(scmd_tmout_pt, "scsi commands timeout for passthrough(seconds)"); +static u32 scmd_tmout_rawdisk = 180; +module_param(scmd_tmout_rawdisk, uint, 0644); +MODULE_PARM_DESC(scmd_tmout_rawdisk, "scsi commands timeout for rawdisk(seconds)");
-static u32 scmd_tmout_nonpt = 180; -module_param(scmd_tmout_nonpt, uint, 0644); -MODULE_PARM_DESC(scmd_tmout_nonpt, "scsi commands timeout for rawdisk&raid(seconds)"); +static u32 scmd_tmout_vd = 180; +module_param(scmd_tmout_vd, uint, 0644); +MODULE_PARM_DESC(scmd_tmout_vd, "scsi commands timeout for vd(seconds)");
-static u32 wait_abl_tmout = 3; -module_param(wait_abl_tmout, uint, 0644); -MODULE_PARM_DESC(wait_abl_tmout, "wait abnormal io timeout(seconds)"); - -static bool use_sgl_force; -module_param(use_sgl_force, bool, 0644); -MODULE_PARM_DESC(use_sgl_force, "force IO use sgl format, default false"); +static bool max_io_force; +module_param(max_io_force, bool, 0644); +MODULE_PARM_DESC(max_io_force, "force max_hw_sectors_kb = 1024, default false(performance first)");
static int ioq_depth_set(const char *val, const struct kernel_param *kp); static const struct kernel_param_ops ioq_depth_ops = { @@ -106,16 +106,20 @@ static const struct kernel_param_ops small_pool_num_ops = { .get = param_get_byte, };
+/* It was found that the spinlock of a single pool suffers + * heavy contention when multiple CPUs allocate from it, so + * multiple pools are introduced to reduce that contention. + */ static unsigned char small_pool_num = 4; module_param_cb(small_pool_num, &small_pool_num_ops, &small_pool_num, 0644); MODULE_PARM_DESC(small_pool_num, "set prp small pool num, default 4, MAX 16");
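The comment above is the rationale for small_pool_num: a single dma_pool serializes all CPUs on one pool spinlock. For illustration only (this sketch is not part of the diff, and the selection policy shown is an assumption rather than the driver's exact rule), spreading allocations across several pools looks like:

    #include <linux/dmapool.h>
    #include <linux/smp.h>

    #define NR_SMALL_POOLS 4	/* mirrors the default of small_pool_num */

    static struct dma_pool *small_pools[NR_SMALL_POOLS];

    /* Hypothetical selector: hash by the current CPU so that
     * concurrent CPUs usually hit different pools, and therefore
     * different pool locks. */
    static struct dma_pool *pick_small_pool(void)
    {
    	return small_pools[raw_smp_processor_id() % NR_SMALL_POOLS];
    }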
static void spraid_free_queue(struct spraid_queue *spraidq); static void spraid_handle_aen_notice(struct spraid_dev *hdev, u32 result); -static void spraid_handle_aen_vs(struct spraid_dev *hdev, u32 result); +static void spraid_handle_aen_vs(struct spraid_dev *hdev, u32 result, u32 result1);
static DEFINE_IDA(spraid_instance_ida); -static dev_t spraid_chr_devt; + static struct class *spraid_class;
#define SPRAID_CAP_TIMEOUT_UNIT_MS (HZ / 2) @@ -131,9 +135,8 @@ static struct workqueue_struct *spraid_wq; #define SPRAID_DRV_VERSION "1.0.0.0"
#define ADMIN_TIMEOUT (admin_tmout * HZ) -#define ADMIN_ERR_TIMEOUT 32757
-#define SPRAID_WAIT_ABNL_CMD_TIMEOUT (wait_abl_tmout * 2) +#define SPRAID_WAIT_ABNL_CMD_TIMEOUT (3 * 2)
#define SPRAID_DMA_MSK_BIT_MAX 64
@@ -147,6 +150,13 @@ enum FW_STAT_CODE { FW_STAT_NEED_RETRY };
+static const char * const raid_levels[] = {"0", "1", "5", "6", "10", "50", "60", "NA"}; + +static const char * const raid_states[] = { + "NA", "NORMAL", "FAULT", "DEGRADE", "NOT_FORMATTED", "FORMATTING", "SANITIZING", + "INITIALIZING", "INITIALIZE_FAIL", "DELETING", "DELETE_FAIL", "WRITE_PROTECT" +}; + static int ioq_depth_set(const char *val, const struct kernel_param *kp) { int n = 0; @@ -231,12 +241,6 @@ static int spraid_pci_enable(struct spraid_dev *hdev) goto disable; }
- ret = pci_alloc_irq_vectors(pdev, 1, 1, PCI_IRQ_ALL_TYPES); - if (ret < 0) { - dev_err(hdev->dev, "Allocate one IRQ for setup admin channel failed\n"); - goto disable; - } - hdev->cap = lo_hi_readq(hdev->bar + SPRAID_REG_CAP); hdev->ioq_depth = min_t(u32, SPRAID_CAP_MQES(hdev->cap) + 1, io_queue_depth); hdev->db_stride = 1 << SPRAID_CAP_STRIDE(hdev->cap); @@ -246,13 +250,20 @@ static int spraid_pci_enable(struct spraid_dev *hdev) dev_err(hdev->dev, "err, dma mask invalid[%llu], set to default\n", maskbit); maskbit = SPRAID_DMA_MSK_BIT_MAX; } - if (dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(maskbit))) { + if (dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(maskbit)) && + dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(32))) { dev_err(hdev->dev, "set dma mask and coherent failed\n"); goto disable; }
dev_info(hdev->dev, "set dma mask[%llu] success\n", maskbit);
+ ret = pci_alloc_irq_vectors(pdev, 1, 1, PCI_IRQ_ALL_TYPES); + if (ret < 0) { + dev_err(hdev->dev, "Allocate one IRQ for setup admin channel failed\n"); + goto disable; + } + pci_enable_pcie_error_reporting(pdev); pci_save_state(pdev);
@@ -263,12 +274,6 @@ static int spraid_pci_enable(struct spraid_dev *hdev) return ret; }
-static inline -struct spraid_admin_request *spraid_admin_req(struct request *req) -{ - return blk_mq_rq_to_pdu(req); -} - static int spraid_npages_prp(u32 size, struct spraid_dev *hdev) { u32 nprps = DIV_ROUND_UP(size + hdev->page_size, hdev->page_size); @@ -419,7 +424,7 @@ static void spraid_submit_cmd(struct spraid_queue *spraidq, const void *cmd) writel(spraidq->sq_tail, spraidq->q_db); spin_unlock_irqrestore(&spraidq->sq_lock, flags);
- dev_log_dbg(spraidq->hdev->dev, "cid[%d], qid[%d], opcode[0x%x], flags[0x%x], hdid[%u]\n", + dev_log_dbg(spraidq->hdev->dev, "cid[%d] qid[%d], opcode[0x%x], flags[0x%x], hdid[%u]\n", acd->command_id, spraidq->qid, acd->opcode, acd->flags, le32_to_cpu(acd->hdid)); }
@@ -605,18 +610,15 @@ static void spraid_setup_rw_cmd(struct spraid_dev *hdev, if (scmd->cmd_len == 6) { datalength = (u32)(scmd->cmnd[4] == 0 ? IO_6_DEFAULT_TX_LEN : scmd->cmnd[4]); - start_lba_lo = ((u32)scmd->cmnd[1] << 16) | - ((u32)scmd->cmnd[2] << 8) | (u32)scmd->cmnd[3]; + start_lba_lo = (u32)get_unaligned_be24(&scmd->cmnd[1]);
start_lba_lo &= 0x1FFFFF; }
/* 10-byte READ(0x28) or WRITE(0x2A) cdb */ else if (scmd->cmd_len == 10) { - datalength = (u32)scmd->cmnd[8] | ((u32)scmd->cmnd[7] << 8); - start_lba_lo = ((u32)scmd->cmnd[2] << 24) | - ((u32)scmd->cmnd[3] << 16) | - ((u32)scmd->cmnd[4] << 8) | (u32)scmd->cmnd[5]; + datalength = (u32)get_unaligned_be16(&scmd->cmnd[7]); + start_lba_lo = get_unaligned_be32(&scmd->cmnd[2]);
if (scmd->cmnd[1] & FUA_MASK) control |= SPRAID_RW_FUA; @@ -624,42 +626,26 @@ static void spraid_setup_rw_cmd(struct spraid_dev *hdev,
/* 12-byte READ(0xA8) or WRITE(0xAA) cdb */ else if (scmd->cmd_len == 12) { - datalength = ((u32)scmd->cmnd[6] << 24) | - ((u32)scmd->cmnd[7] << 16) | - ((u32)scmd->cmnd[8] << 8) | (u32)scmd->cmnd[9]; - start_lba_lo = ((u32)scmd->cmnd[2] << 24) | - ((u32)scmd->cmnd[3] << 16) | - ((u32)scmd->cmnd[4] << 8) | (u32)scmd->cmnd[5]; + datalength = get_unaligned_be32(&scmd->cmnd[6]); + start_lba_lo = get_unaligned_be32(&scmd->cmnd[2]);
if (scmd->cmnd[1] & FUA_MASK) control |= SPRAID_RW_FUA; } /* 16-byte READ(0x88) or WRITE(0x8A) cdb */ else if (scmd->cmd_len == 16) { - datalength = ((u32)scmd->cmnd[10] << 24) | - ((u32)scmd->cmnd[11] << 16) | - ((u32)scmd->cmnd[12] << 8) | (u32)scmd->cmnd[13]; - start_lba_lo = ((u32)scmd->cmnd[6] << 24) | - ((u32)scmd->cmnd[7] << 16) | - ((u32)scmd->cmnd[8] << 8) | (u32)scmd->cmnd[9]; - start_lba_hi = ((u32)scmd->cmnd[2] << 24) | - ((u32)scmd->cmnd[3] << 16) | - ((u32)scmd->cmnd[4] << 8) | (u32)scmd->cmnd[5]; + datalength = get_unaligned_be32(&scmd->cmnd[10]); + start_lba_lo = get_unaligned_be32(&scmd->cmnd[6]); + start_lba_hi = get_unaligned_be32(&scmd->cmnd[2]);
if (scmd->cmnd[1] & FUA_MASK) control |= SPRAID_RW_FUA; } /* 32-byte READ(0x88) or WRITE(0x8A) cdb */ else if (scmd->cmd_len == 32) { - datalength = ((u32)scmd->cmnd[28] << 24) | - ((u32)scmd->cmnd[29] << 16) | - ((u32)scmd->cmnd[30] << 8) | (u32)scmd->cmnd[31]; - start_lba_lo = ((u32)scmd->cmnd[16] << 24) | - ((u32)scmd->cmnd[17] << 16) | - ((u32)scmd->cmnd[18] << 8) | (u32)scmd->cmnd[19]; - start_lba_hi = ((u32)scmd->cmnd[12] << 24) | - ((u32)scmd->cmnd[13] << 16) | - ((u32)scmd->cmnd[14] << 8) | (u32)scmd->cmnd[15]; + datalength = get_unaligned_be32(&scmd->cmnd[28]); + start_lba_lo = get_unaligned_be32(&scmd->cmnd[16]); + start_lba_hi = get_unaligned_be32(&scmd->cmnd[12]);
if (scmd->cmnd[10] & FUA_MASK) control |= SPRAID_RW_FUA; @@ -814,7 +800,7 @@ static void spraid_map_status(struct spraid_iod *iod, struct scsi_cmnd *scmd, if (scmd->result & SAM_STAT_CHECK_CONDITION) { memset(scmd->sense_buffer, 0, SCSI_SENSE_BUFFERSIZE); memcpy(scmd->sense_buffer, iod->sense, SCSI_SENSE_BUFFERSIZE); - set_driver_byte(scmd, DRIVER_SENSE); + scmd->result = (scmd->result & 0x00ffffff) | (DRIVER_SENSE << 24); } break; case FW_STAT_ABORTED: @@ -825,6 +811,8 @@ static void spraid_map_status(struct spraid_iod *iod, struct scsi_cmnd *scmd, break; default: set_host_byte(scmd, DID_BAD_TARGET); + dev_warn(iod->spraidq->hdev->dev, "[%s] cid[%d] qid[%d] bad status[0x%x]\n", + __func__, cqe->cmd_id, le16_to_cpu(cqe->sq_id), le16_to_cpu(cqe->status)); break; } } @@ -850,14 +838,13 @@ static int spraid_queue_command(struct Scsi_Host *shost, struct scsi_cmnd *scmd) int ret;
if (unlikely(!scmd)) { - dev_err(hdev->dev, "err, scmd is null, return 0\n"); + dev_err(hdev->dev, "err, scmd is null\n"); return 0; }
if (unlikely(hdev->state != SPRAID_LIVE)) { set_host_byte(scmd, DID_NO_CONNECT); scmd->scsi_done(scmd); - dev_err(hdev->dev, "[%s] err, hdev state is not live\n", __func__); return 0; }
@@ -894,7 +881,7 @@ static int spraid_queue_command(struct Scsi_Host *shost, struct scsi_cmnd *scmd) WRITE_ONCE(iod->state, SPRAID_CMD_IN_FLIGHT); spraid_submit_cmd(ioq, &ioq_cmd); elapsed = jiffies - scmd->jiffies_at_alloc; - dev_log_dbg(hdev->dev, "cid[%d], qid[%d] submit IO cost %3ld.%3ld seconds\n", + dev_log_dbg(hdev->dev, "cid[%d] qid[%d] submit IO cost %3ld.%3ld seconds\n", cid, hwq, elapsed / HZ, elapsed % HZ); return 0;
@@ -945,6 +932,10 @@ static int spraid_slave_alloc(struct scsi_device *sdev)
scan_host: hostdata->hdid = le32_to_cpu(hdev->devices[idx].hdid); + hostdata->max_io_kb = le16_to_cpu(hdev->devices[idx].max_io_kb); + hostdata->attr = hdev->devices[idx].attr; + hostdata->flag = hdev->devices[idx].flag; + hostdata->rg_id = 0xff; sdev->hostdata = hostdata; up_read(&hdev->devices_rwsem); return 0; @@ -958,30 +949,17 @@ static void spraid_slave_destroy(struct scsi_device *sdev)
static int spraid_slave_configure(struct scsi_device *sdev) { - u16 idx; - unsigned int timeout = scmd_tmout_nonpt * HZ; + unsigned int timeout = scmd_tmout_rawdisk * HZ; struct spraid_dev *hdev = shost_priv(sdev->host); struct spraid_sdev_hostdata *hostdata = sdev->hostdata; u32 max_sec = sdev->host->max_sectors;
- if (!hostdata) { - idx = hostdata->hdid - 1; - if (sdev->channel == hdev->devices[idx].channel && - sdev->id == le16_to_cpu(hdev->devices[idx].target) && - sdev->lun < hdev->devices[idx].lun) { - if (SPRAID_DEV_INFO_ATTR_PT(hdev->devices[idx].attr)) - timeout = scmd_tmout_pt * HZ; - else - timeout = scmd_tmout_nonpt * HZ; - max_sec = le16_to_cpu(hdev->devices[idx].max_io_kb) << 1; - } else { - dev_err(hdev->dev, "[%s] err, sdev->channel:id:lun[%d:%d:%lld];" - "devices[%d], channel:target:lun[%d:%d:%d]\n", - __func__, sdev->channel, sdev->id, sdev->lun, - idx, hdev->devices[idx].channel, - hdev->devices[idx].target, - hdev->devices[idx].lun); - } + if (hostdata) { + if (SPRAID_DEV_INFO_ATTR_VD(hostdata->attr)) + timeout = scmd_tmout_vd * HZ; + else if (SPRAID_DEV_INFO_ATTR_RAWDISK(hostdata->attr)) + timeout = scmd_tmout_rawdisk * HZ; + max_sec = hostdata->max_io_kb << 1; } else { dev_err(hdev->dev, "[%s] err, sdev->hostdata is null\n", __func__); } @@ -991,7 +969,9 @@ static int spraid_slave_configure(struct scsi_device *sdev)
if ((max_sec == 0) || (max_sec > sdev->host->max_sectors)) max_sec = sdev->host->max_sectors; - blk_queue_max_hw_sectors(sdev->request_queue, max_sec); + + if (!max_io_force) + blk_queue_max_hw_sectors(sdev->request_queue, max_sec);
dev_info(hdev->dev, "[%s] sdev->channel:id:lun[%d:%d:%lld], scmd_timeout[%d]s, maxsec[%d]\n", __func__, sdev->channel, sdev->id, sdev->lun, timeout / HZ, max_sec); @@ -1176,6 +1156,75 @@ static inline bool spraid_cqe_pending(struct spraid_queue *spraidq) spraidq->cq_phase; }
+static void spraid_sata_report_zone_handle(struct scsi_cmnd *scmd, struct spraid_iod *iod) +{ + int i = 0; + unsigned int bytes = 0; + struct scatterlist *sg = scsi_sglist(scmd); + + scsi_for_each_sg(scmd, sg, iod->nsge, i) { + unsigned int offset = 0; + + if (bytes == 0) { + char *hdr; + u32 list_length; + u64 max_lba, opt_lba; + u16 same; + + hdr = sg_virt(sg); + + list_length = get_unaligned_le32(&hdr[0]); + same = get_unaligned_le16(&hdr[4]); + max_lba = get_unaligned_le64(&hdr[8]); + opt_lba = get_unaligned_le64(&hdr[16]); + put_unaligned_be32(list_length, &hdr[0]); + hdr[4] = same & 0xf; + put_unaligned_be64(max_lba, &hdr[8]); + put_unaligned_be64(opt_lba, &hdr[16]); + offset += 64; + bytes += 64; + } + while (offset < sg_dma_len(sg)) { + char *rec; + u8 cond, type, non_seq, reset; + u64 size, start, wp; + + rec = sg_virt(sg) + offset; + type = rec[0] & 0xf; + cond = (rec[1] >> 4) & 0xf; + non_seq = (rec[1] & 2); + reset = (rec[1] & 1); + size = get_unaligned_le64(&rec[8]); + start = get_unaligned_le64(&rec[16]); + wp = get_unaligned_le64(&rec[24]); + rec[0] = type; + rec[1] = (cond << 4) | non_seq | reset; + put_unaligned_be64(size, &rec[8]); + put_unaligned_be64(start, &rec[16]); + put_unaligned_be64(wp, &rec[24]); + WARN_ON(offset + 64 > sg_dma_len(sg)); + offset += 64; + bytes += 64; + } + } +} + +static inline void spraid_handle_ata_cmd(struct spraid_dev *hdev, struct scsi_cmnd *scmd, + struct spraid_iod *iod) +{ + if (hdev->ctrl_info->card_type != SPRAID_CARD_HBA) + return; + + switch (scmd->cmnd[0]) { + case ZBC_IN: + dev_info(hdev->dev, "[%s] process report zone\n", __func__); + spraid_sata_report_zone_handle(scmd, iod); + break; + default: + break; + } +} + static void spraid_complete_ioq_cmnd(struct spraid_queue *ioq, struct spraid_completion *cqe) { struct spraid_dev *hdev = ioq->hdev; @@ -1197,12 +1246,12 @@ static void spraid_complete_ioq_cmnd(struct spraid_queue *ioq, struct spraid_com iod = scsi_cmd_priv(scmd);
elapsed = jiffies - scmd->jiffies_at_alloc; - dev_log_dbg(hdev->dev, "cid[%d], qid[%d] finish IO cost %3ld.%3ld seconds\n", + dev_log_dbg(hdev->dev, "cid[%d] qid[%d] finish IO cost %3ld.%3ld seconds\n", cqe->cmd_id, ioq->qid, elapsed / HZ, elapsed % HZ);
if (cmpxchg(&iod->state, SPRAID_CMD_IN_FLIGHT, SPRAID_CMD_COMPLETE) != SPRAID_CMD_IN_FLIGHT) { - dev_warn(hdev->dev, "cid[%d], qid[%d] enters abnormal handler, cost %3ld.%3ld seconds\n", + dev_warn(hdev->dev, "cid[%d] qid[%d] enters abnormal handler, cost %3ld.%3ld seconds\n", cqe->cmd_id, ioq->qid, elapsed / HZ, elapsed % HZ); WRITE_ONCE(iod->state, SPRAID_CMD_TMO_COMPLETE);
@@ -1215,6 +1264,8 @@ static void spraid_complete_ioq_cmnd(struct spraid_queue *ioq, struct spraid_com return; }
+ spraid_handle_ata_cmd(hdev, scmd, iod); + spraid_map_status(iod, scmd, cqe); if (iod->nsge) { iod->nsge = 0; @@ -1224,38 +1275,36 @@ static void spraid_complete_ioq_cmnd(struct spraid_queue *ioq, struct spraid_com scmd->scsi_done(scmd); }
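The new spraid_handle_ata_cmd() hook above runs only for HBA-mode cards: a SATA drive answers ZBC_IN (REPORT ZONES) with little-endian fields, while the SCSI midlayer expects big-endian, so the 64-byte header and each 64-byte zone descriptor are rewritten in place. One descriptor conversion, reduced to a sketch (illustration only; the field offsets match the handler above):

    #include <asm/unaligned.h>

    /* Convert one 64-byte zone descriptor from ATA (LE) to SCSI (BE)
     * layout in place. */
    static void zone_desc_le_to_be(u8 *rec)
    {
    	u64 size  = get_unaligned_le64(&rec[8]);
    	u64 start = get_unaligned_le64(&rec[16]);
    	u64 wp    = get_unaligned_le64(&rec[24]);

    	put_unaligned_be64(size, &rec[8]);
    	put_unaligned_be64(start, &rec[16]);
    	put_unaligned_be64(wp, &rec[24]);
    }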
-static inline void spraid_end_admin_request(struct request *req, __le16 status, - __le32 result0, __le32 result1) -{ - struct spraid_admin_request *rq = spraid_admin_req(req); - - rq->status = le16_to_cpu(status) >> 1; - rq->result0 = le32_to_cpu(result0); - rq->result1 = le32_to_cpu(result1); - blk_mq_complete_request(req); -} - static void spraid_complete_adminq_cmnd(struct spraid_queue *adminq, struct spraid_completion *cqe) { - struct blk_mq_tags *tags = adminq->hdev->admin_tagset.tags[0]; - struct request *req; + struct spraid_dev *hdev = adminq->hdev; + struct spraid_cmd *adm_cmd;
- req = blk_mq_tag_to_rq(tags, cqe->cmd_id); - if (unlikely(!req)) { + adm_cmd = hdev->adm_cmds + cqe->cmd_id; + if (unlikely(adm_cmd->state == SPRAID_CMD_IDLE)) { dev_warn(adminq->hdev->dev, "Invalid id %d completed on queue %d\n", cqe->cmd_id, le16_to_cpu(cqe->sq_id)); return; } - spraid_end_admin_request(req, cqe->status, cqe->result, cqe->result1); + + adm_cmd->status = le16_to_cpu(cqe->status) >> 1; + adm_cmd->result0 = le32_to_cpu(cqe->result); + adm_cmd->result1 = le32_to_cpu(cqe->result1); + + complete(&adm_cmd->cmd_done); }
+static void spraid_send_aen(struct spraid_dev *hdev, u16 cid); + static void spraid_complete_aen(struct spraid_queue *spraidq, struct spraid_completion *cqe) { struct spraid_dev *hdev = spraidq->hdev; u32 result = le32_to_cpu(cqe->result);
- dev_info(hdev->dev, "rcv aen, status[%x], result[%x]\n", - le16_to_cpu(cqe->status) >> 1, result); + dev_info(hdev->dev, "rcv aen, cid[%d], status[0x%x], result[0x%x]\n", + cqe->cmd_id, le16_to_cpu(cqe->status) >> 1, result); + + spraid_send_aen(hdev, cqe->cmd_id);
if ((le16_to_cpu(cqe->status) >> 1) != SPRAID_SC_SUCCESS) return; @@ -1264,22 +1313,19 @@ static void spraid_complete_aen(struct spraid_queue *spraidq, struct spraid_comp spraid_handle_aen_notice(hdev, result); break; case SPRAID_AEN_VS: - spraid_handle_aen_vs(hdev, result); + spraid_handle_aen_vs(hdev, result, le32_to_cpu(cqe->result1)); break; default: dev_warn(hdev->dev, "Unsupported async event type: %u\n", result & 0x7); break; } - queue_work(spraid_wq, &hdev->aen_work); }
-static void spraid_put_ioq_ptcmd(struct spraid_dev *hdev, struct spraid_ioq_ptcmd *cmd); - static void spraid_complete_ioq_sync_cmnd(struct spraid_queue *ioq, struct spraid_completion *cqe) { struct spraid_dev *hdev = ioq->hdev; - struct spraid_ioq_ptcmd *ptcmd; + struct spraid_cmd *ptcmd;
ptcmd = hdev->ioq_ptcmds + (ioq->qid - 1) * SPRAID_PTCMDS_PERQ + cqe->cmd_id - SPRAID_IO_BLK_MQ_DEPTH; @@ -1289,8 +1335,6 @@ static void spraid_complete_ioq_sync_cmnd(struct spraid_queue *ioq, struct sprai ptcmd->result1 = le32_to_cpu(cqe->result1);
complete(&ptcmd->cmd_done); - - spraid_put_ioq_ptcmd(hdev, ptcmd); }
static inline void spraid_handle_cqe(struct spraid_queue *spraidq, u16 idx) @@ -1304,7 +1348,7 @@ static inline void spraid_handle_cqe(struct spraid_queue *spraidq, u16 idx) return; }
- dev_log_dbg(hdev->dev, "cid[%d], qid[%d], result[0x%x], sq_id[%d], status[0x%x]\n", + dev_log_dbg(hdev->dev, "cid[%d] qid[%d], result[0x%x], sq_id[%d], status[0x%x]\n", cqe->cmd_id, spraidq->qid, le32_to_cpu(cqe->result), le16_to_cpu(cqe->sq_id), le16_to_cpu(cqe->status));
@@ -1452,62 +1496,119 @@ static u32 spraid_bar_size(struct spraid_dev *hdev, u32 nr_ioqs) return (SPRAID_REG_DBS + ((nr_ioqs + 1) * 8 * hdev->db_stride)); }
-static inline void spraid_clear_spraid_request(struct request *req) +static int spraid_alloc_admin_cmds(struct spraid_dev *hdev) { - if (!(req->rq_flags & RQF_DONTPREP)) { - spraid_admin_req(req)->flags = 0; - req->rq_flags |= RQF_DONTPREP; + int i; + + INIT_LIST_HEAD(&hdev->adm_cmd_list); + spin_lock_init(&hdev->adm_cmd_lock); + + hdev->adm_cmds = kcalloc_node(SPRAID_AQ_BLK_MQ_DEPTH, sizeof(struct spraid_cmd), + GFP_KERNEL, hdev->numa_node); + + if (!hdev->adm_cmds) { + dev_err(hdev->dev, "Alloc admin cmds failed\n"); + return -ENOMEM; + } + + for (i = 0; i < SPRAID_AQ_BLK_MQ_DEPTH; i++) { + hdev->adm_cmds[i].qid = 0; + hdev->adm_cmds[i].cid = i; + list_add_tail(&(hdev->adm_cmds[i].list), &hdev->adm_cmd_list); } + + dev_info(hdev->dev, "Alloc admin cmds success, num[%d]\n", SPRAID_AQ_BLK_MQ_DEPTH); + + return 0; }
-static struct request *spraid_alloc_admin_request(struct request_queue *q, - struct spraid_admin_command *cmd, - blk_mq_req_flags_t flags) +static void spraid_free_admin_cmds(struct spraid_dev *hdev) { - u32 op = COMMAND_IS_WRITE(cmd) ? REQ_OP_DRV_OUT : REQ_OP_DRV_IN; - struct request *req; + kfree(hdev->adm_cmds); + hdev->adm_cmds = NULL; + INIT_LIST_HEAD(&hdev->adm_cmd_list); +} + +static struct spraid_cmd *spraid_get_cmd(struct spraid_dev *hdev, enum spraid_cmd_type type) +{ + struct spraid_cmd *cmd = NULL; + unsigned long flags; + struct list_head *head = &hdev->adm_cmd_list; + spinlock_t *slock = &hdev->adm_cmd_lock; + + if (type == SPRAID_CMD_IOPT) { + head = &hdev->ioq_pt_list; + slock = &hdev->ioq_pt_lock; + } + + spin_lock_irqsave(slock, flags); + if (list_empty(head)) { + spin_unlock_irqrestore(slock, flags); + dev_err(hdev->dev, "err, cmd[%d] list empty\n", type); + return NULL; + } + cmd = list_entry(head->next, struct spraid_cmd, list); + list_del_init(&cmd->list); + spin_unlock_irqrestore(slock, flags);
- req = blk_mq_alloc_request(q, op, flags); - if (IS_ERR(req)) - return req; - req->cmd_flags |= REQ_FAILFAST_DRIVER; - spraid_clear_spraid_request(req); - spraid_admin_req(req)->cmd = cmd; + WRITE_ONCE(cmd->state, SPRAID_CMD_IN_FLIGHT);
- return req; + return cmd; }
-static int spraid_submit_admin_sync_cmd(struct request_queue *q, - struct spraid_admin_command *cmd, - u32 *result, void *buffer, - u32 bufflen, u32 timeout, int at_head, blk_mq_req_flags_t flags) +static void spraid_put_cmd(struct spraid_dev *hdev, struct spraid_cmd *cmd, + enum spraid_cmd_type type) { - struct request *req; - int ret; + unsigned long flags; + struct list_head *head = &hdev->adm_cmd_list; + spinlock_t *slock = &hdev->adm_cmd_lock;
- req = spraid_alloc_admin_request(q, cmd, flags); - if (IS_ERR(req)) - return PTR_ERR(req); + if (type == SPRAID_CMD_IOPT) { + head = &hdev->ioq_pt_list; + slock = &hdev->ioq_pt_lock; + }
- req->timeout = timeout ? timeout : ADMIN_TIMEOUT; - if (buffer && bufflen) { - ret = blk_rq_map_kern(q, req, buffer, bufflen, GFP_KERNEL); - if (ret) - goto out; + spin_lock_irqsave(slock, flags); + WRITE_ONCE(cmd->state, SPRAID_CMD_IDLE); + list_add_tail(&cmd->list, head); + spin_unlock_irqrestore(slock, flags); +} + + +static int spraid_submit_admin_sync_cmd(struct spraid_dev *hdev, struct spraid_admin_command *cmd, + u32 *result0, u32 *result1, u32 timeout) +{ + struct spraid_cmd *adm_cmd = spraid_get_cmd(hdev, SPRAID_CMD_ADM); + + if (!adm_cmd) { + dev_err(hdev->dev, "err, get admin cmd failed\n"); + return -EFAULT; } - blk_execute_rq(req->q, NULL, req, at_head);
- if (result) - *result = spraid_admin_req(req)->result0; + timeout = timeout ? timeout : ADMIN_TIMEOUT;
- if (spraid_admin_req(req)->flags & SPRAID_REQ_CANCELLED) - ret = -EINTR; - else - ret = spraid_admin_req(req)->status; + init_completion(&adm_cmd->cmd_done);
-out: - blk_mq_free_request(req); - return ret; + cmd->common.command_id = adm_cmd->cid; + spraid_submit_cmd(&hdev->queues[0], cmd); + + if (!wait_for_completion_timeout(&adm_cmd->cmd_done, timeout)) { + dev_err(hdev->dev, "[%s] cid[%d] qid[%d] timeout, opcode[0x%x] subopcode[0x%x]\n", + __func__, adm_cmd->cid, adm_cmd->qid, cmd->usr_cmd.opcode, + cmd->usr_cmd.info_0.subopcode); + WRITE_ONCE(adm_cmd->state, SPRAID_CMD_TIMEOUT); + spraid_put_cmd(hdev, adm_cmd, SPRAID_CMD_ADM); + return -ETIME; + } + + if (result0) + *result0 = adm_cmd->result0; + if (result1) + *result1 = adm_cmd->result1; + + spraid_put_cmd(hdev, adm_cmd, SPRAID_CMD_ADM); + + return adm_cmd->status; }
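Every admin submitter below (create CQ/SQ, delete queue, set features, abort, reset, get info) now funnels through this one synchronous helper: get a preallocated spraid_cmd, submit, wait_for_completion_timeout(), put it back. A caller sketch (illustration only; the opcode is one actually used later in this patch):

    struct spraid_admin_command cmd;
    u32 result0 = 0;
    int ret;

    memset(&cmd, 0, sizeof(cmd));
    cmd.get_info.opcode = SPRAID_ADMIN_GET_INFO;

    /* A timeout of 0 makes the helper fall back to ADMIN_TIMEOUT. */
    ret = spraid_submit_admin_sync_cmd(hdev, &cmd, &result0, NULL, 0);
    if (ret)
    	dev_err(hdev->dev, "example admin cmd failed: %d\n", ret);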
static int spraid_create_cq(struct spraid_dev *hdev, u16 qid, @@ -1524,8 +1625,7 @@ static int spraid_create_cq(struct spraid_dev *hdev, u16 qid, admin_cmd.create_cq.cq_flags = cpu_to_le16(flags); admin_cmd.create_cq.irq_vector = cpu_to_le16(cq_vector);
- return spraid_submit_admin_sync_cmd(hdev->admin_q, &admin_cmd, NULL, - NULL, 0, 0, 0, 0); + return spraid_submit_admin_sync_cmd(hdev, &admin_cmd, NULL, NULL, 0); }
static int spraid_create_sq(struct spraid_dev *hdev, u16 qid, @@ -1542,8 +1642,7 @@ static int spraid_create_sq(struct spraid_dev *hdev, u16 qid, admin_cmd.create_sq.sq_flags = cpu_to_le16(flags); admin_cmd.create_sq.cqid = cpu_to_le16(qid);
- return spraid_submit_admin_sync_cmd(hdev->admin_q, &admin_cmd, NULL, - NULL, 0, 0, 0, 0); + return spraid_submit_admin_sync_cmd(hdev, &admin_cmd, NULL, NULL, 0); }
static void spraid_free_queue(struct spraid_queue *spraidq) @@ -1581,8 +1680,7 @@ static int spraid_delete_queue(struct spraid_dev *hdev, u8 op, u16 id) admin_cmd.delete_queue.opcode = op; admin_cmd.delete_queue.qid = cpu_to_le16(id);
- ret = spraid_submit_admin_sync_cmd(hdev->admin_q, &admin_cmd, NULL, - NULL, 0, 0, 0, 0); + ret = spraid_submit_admin_sync_cmd(hdev, &admin_cmd, NULL, NULL, 0);
if (ret) dev_err(hdev->dev, "Delete %s:[%d] failed\n", @@ -1663,19 +1761,28 @@ static int spraid_set_features(struct spraid_dev *hdev, u32 fid, u32 dword11, vo size_t buflen, u32 *result) { struct spraid_admin_command admin_cmd; - u32 res; int ret; + u8 *data_ptr = NULL; + dma_addr_t data_dma = 0; + + if (buffer && buflen) { + data_ptr = dma_alloc_coherent(hdev->dev, buflen, &data_dma, GFP_KERNEL); + if (!data_ptr) + return -ENOMEM; + + memcpy(data_ptr, buffer, buflen); + }
memset(&admin_cmd, 0, sizeof(admin_cmd)); admin_cmd.features.opcode = SPRAID_ADMIN_SET_FEATURES; admin_cmd.features.fid = cpu_to_le32(fid); admin_cmd.features.dword11 = cpu_to_le32(dword11); + admin_cmd.common.dptr.prp1 = cpu_to_le64(data_dma);
- ret = spraid_submit_admin_sync_cmd(hdev->admin_q, &admin_cmd, &res, - buffer, buflen, 0, 0, 0); + ret = spraid_submit_admin_sync_cmd(hdev, &admin_cmd, result, NULL, 0);
- if (!ret && result) - *result = res; + if (data_ptr) + dma_free_coherent(hdev->dev, buflen, data_ptr, data_dma);
return ret; } @@ -1764,8 +1871,7 @@ static int spraid_setup_io_queues(struct spraid_dev *hdev) break; } dev_info(hdev->dev, "[%s] max_qid: %d, queue_count: %d, online_queue: %d, ioq_depth: %d\n", - __func__, hdev->max_qid, hdev->queue_count, - hdev->online_queues, hdev->ioq_depth); + __func__, hdev->max_qid, hdev->queue_count, hdev->online_queues, hdev->ioq_depth);
return spraid_create_io_queues(hdev); } @@ -1889,10 +1995,11 @@ static int spraid_get_dev_list(struct spraid_dev *hdev, struct spraid_dev_info * u32 nd = le32_to_cpu(hdev->ctrl_info->nd); struct spraid_admin_command admin_cmd; struct spraid_dev_list *list_buf; + dma_addr_t data_dma = 0; u32 i, idx, hdid, ndev; int ret = 0;
- list_buf = kmalloc(sizeof(*list_buf), GFP_KERNEL); + list_buf = dma_alloc_coherent(hdev->dev, PAGE_SIZE, &data_dma, GFP_KERNEL); if (!list_buf) return -ENOMEM;
@@ -1901,9 +2008,9 @@ static int spraid_get_dev_list(struct spraid_dev *hdev, struct spraid_dev_info * admin_cmd.get_info.opcode = SPRAID_ADMIN_GET_INFO; admin_cmd.get_info.type = SPRAID_GET_INFO_DEV_LIST; admin_cmd.get_info.cdw11 = cpu_to_le32(idx); + admin_cmd.common.dptr.prp1 = cpu_to_le64(data_dma);
- ret = spraid_submit_admin_sync_cmd(hdev->admin_q, &admin_cmd, NULL, list_buf, - sizeof(*list_buf), 0, 0, 0); + ret = spraid_submit_admin_sync_cmd(hdev, &admin_cmd, NULL, NULL, 0);
if (ret) { dev_err(hdev->dev, "Get device list failed, nd: %u, idx: %u, ret: %d\n", @@ -1916,12 +2023,11 @@ static int spraid_get_dev_list(struct spraid_dev *hdev, struct spraid_dev_info *
for (i = 0; i < ndev; i++) { hdid = le32_to_cpu(list_buf->devices[i].hdid); - dev_info(hdev->dev, "list_buf->devices[%d], hdid: %u target: %d, channel: %d, lun: %d, attr[%x]\n", - i, hdid, - le16_to_cpu(list_buf->devices[i].target), - list_buf->devices[i].channel, - list_buf->devices[i].lun, - list_buf->devices[i].attr); + dev_info(hdev->dev, "list_buf->devices[%d], hdid: %u target: %d, channel: %d, lun: %d, attr[0x%x]\n", + i, hdid, le16_to_cpu(list_buf->devices[i].target), + list_buf->devices[i].channel, + list_buf->devices[i].lun, + list_buf->devices[i].attr); if (hdid > nd || hdid == 0) { dev_err(hdev->dev, "err, hdid[%d] invalid\n", hdid); continue; @@ -1936,21 +2042,29 @@ static int spraid_get_dev_list(struct spraid_dev *hdev, struct spraid_dev_info * }
out: - kfree(list_buf); + dma_free_coherent(hdev->dev, PAGE_SIZE, list_buf, data_dma); return ret; }
-static void spraid_send_aen(struct spraid_dev *hdev) +static void spraid_send_aen(struct spraid_dev *hdev, u16 cid) { struct spraid_queue *adminq = &hdev->queues[0]; struct spraid_admin_command admin_cmd;
memset(&admin_cmd, 0, sizeof(admin_cmd)); admin_cmd.common.opcode = SPRAID_ADMIN_ASYNC_EVENT; - admin_cmd.common.command_id = SPRAID_AQ_BLK_MQ_DEPTH; + admin_cmd.common.command_id = cid;
spraid_submit_cmd(adminq, &admin_cmd); - dev_info(hdev->dev, "send aen, cid[%d]\n", SPRAID_AQ_BLK_MQ_DEPTH); + dev_info(hdev->dev, "send aen, cid[%d]\n", cid); +} + +static inline void spraid_send_all_aen(struct spraid_dev *hdev) +{ + u16 i; + + for (i = 0; i < hdev->ctrl_info->aerl; i++) + spraid_send_aen(hdev, i + SPRAID_AQ_BLK_MQ_DEPTH); }
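Because AEN commands never come from the regular tag allocator, their ids are carved out just above the admin queue depth; spraid_complete_aen() can then both recognize an AEN completion and re-arm the same id via the new spraid_send_aen(hdev, cqe->cmd_id) call. A hypothetical helper (not in the patch) makes the invariant explicit:

    /* AEN cids occupy [SPRAID_AQ_BLK_MQ_DEPTH,
     *		       SPRAID_AQ_BLK_MQ_DEPTH + aerl). */
    static inline bool spraid_cid_is_aen(struct spraid_dev *hdev, u16 cid)
    {
    	return cid >= SPRAID_AQ_BLK_MQ_DEPTH &&
    	       cid <  SPRAID_AQ_BLK_MQ_DEPTH + hdev->ctrl_info->aerl;
    }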
static int spraid_add_device(struct spraid_dev *hdev, struct spraid_dev_info *device) @@ -1958,6 +2072,10 @@ static int spraid_add_device(struct spraid_dev *hdev, struct spraid_dev_info *de struct Scsi_Host *shost = hdev->shost; struct scsi_device *sdev;
+ dev_info(hdev->dev, "add device, hdid: %u target: %d, channel: %d, lun: %d, attr[0x%x]\n", + le32_to_cpu(device->hdid), le16_to_cpu(device->target), + device->channel, device->lun, device->attr); + sdev = scsi_device_lookup(shost, device->channel, le16_to_cpu(device->target), 0); if (sdev) { dev_warn(hdev->dev, "Device is already exist, channel: %d, target_id: %d, lun: %d\n", @@ -1974,9 +2092,13 @@ static int spraid_rescan_device(struct spraid_dev *hdev, struct spraid_dev_info struct Scsi_Host *shost = hdev->shost; struct scsi_device *sdev;
+ dev_info(hdev->dev, "rescan device, hdid: %u target: %d, channel: %d, lun: %d, attr[0x%x]\n", + le32_to_cpu(device->hdid), le16_to_cpu(device->target), + device->channel, device->lun, device->attr); + sdev = scsi_device_lookup(shost, device->channel, le16_to_cpu(device->target), 0); if (!sdev) { - dev_warn(hdev->dev, "Device is not exit, channel: %d, target_id: %d, lun: %d\n", + dev_warn(hdev->dev, "device is not exit rescan it, channel: %d, target_id: %d, lun: %d\n", device->channel, le16_to_cpu(device->target), 0); return -ENODEV; } @@ -1991,9 +2113,13 @@ static int spraid_remove_device(struct spraid_dev *hdev, struct spraid_dev_info struct Scsi_Host *shost = hdev->shost; struct scsi_device *sdev;
+ dev_info(hdev->dev, "remove device, hdid: %u target: %d, channel: %d, lun: %d, attr[0x%x]\n", + le32_to_cpu(org_device->hdid), le16_to_cpu(org_device->target), + org_device->channel, org_device->lun, org_device->attr); + sdev = scsi_device_lookup(shost, org_device->channel, le16_to_cpu(org_device->target), 0); if (!sdev) { - dev_warn(hdev->dev, "Device is not exit, channel: %d, target_id: %d, lun: %d\n", + dev_warn(hdev->dev, "device is not exit remove it, channel: %d, target_id: %d, lun: %d\n", org_device->channel, le16_to_cpu(org_device->target), 0); return -ENODEV; } @@ -2029,36 +2155,54 @@ static int spraid_dev_list_init(struct spraid_dev *hdev) return 0; }
+static int luntarget_cmp_func(const void *l, const void *r) +{ + const struct spraid_dev_info *ln = l; + const struct spraid_dev_info *rn = r; + + if (ln->channel == rn->channel) + return le16_to_cpu(ln->target) - le16_to_cpu(rn->target); + + return ln->channel - rn->channel; +} + static void spraid_scan_work(struct work_struct *work) { struct spraid_dev *hdev = container_of(work, struct spraid_dev, scan_work); struct spraid_dev_info *devices, *org_devices; + struct spraid_dev_info *sortdevice; u32 nd = le32_to_cpu(hdev->ctrl_info->nd); u8 flag, org_flag; int i, ret; + int count = 0;
devices = kcalloc(nd, sizeof(struct spraid_dev_info), GFP_KERNEL); if (!devices) return; + + sortdevice = kcalloc(nd, sizeof(struct spraid_dev_info), GFP_KERNEL); + if (!sortdevice) + goto free_list; + ret = spraid_get_dev_list(hdev, devices); if (ret) - goto free_list; + goto free_all; org_devices = hdev->devices; for (i = 0; i < nd; i++) { org_flag = org_devices[i].flag; flag = devices[i].flag;
- dev_log_dbg(hdev->dev, "i: %d, org_flag: 0x%x, flag: 0x%x\n", - i, org_flag, flag); + dev_log_dbg(hdev->dev, "i: %d, org_flag: 0x%x, flag: 0x%x\n", i, org_flag, flag);
if (SPRAID_DEV_INFO_FLAG_VALID(flag)) { if (!SPRAID_DEV_INFO_FLAG_VALID(org_flag)) { down_write(&hdev->devices_rwsem); memcpy(&org_devices[i], &devices[i], - sizeof(struct spraid_dev_info)); + sizeof(struct spraid_dev_info)); + memcpy(&sortdevice[count++], &devices[i], + sizeof(struct spraid_dev_info)); up_write(&hdev->devices_rwsem); - spraid_add_device(hdev, &devices[i]); } else if (SPRAID_DEV_INFO_FLAG_CHANGE(flag)) { spraid_rescan_device(hdev, &devices[i]); } @@ -2071,6 +2215,16 @@ static void spraid_scan_work(struct work_struct *work) } } } + + dev_info(hdev->dev, "scan work add device count = %d\n", count); + + sort(sortdevice, count, sizeof(sortdevice[0]), luntarget_cmp_func, NULL); + + for (i = 0; i < count; i++) + spraid_add_device(hdev, &sortdevice[i]); + +free_all: + kfree(sortdevice); free_list: kfree(devices); } @@ -2083,6 +2237,15 @@ static void spraid_timesyn_work(struct work_struct *work) spraid_configure_timestamp(hdev); }
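The scan-work rework above first collects every newly-valid device into sortdevice[], orders the array with sort() from <linux/sort.h> (a heapsort, so not stable) using luntarget_cmp_func(), and only then calls spraid_add_device(); targets therefore show up in (channel, target) order. The comparator contract, for reference (illustration only):

    /* luntarget_cmp_func(l, r):
     *   < 0 : l sorts before r
     *   = 0 : keys equal (relative order of ties is unspecified)
     *   > 0 : r sorts before l
     */
    sort(sortdevice, count, sizeof(sortdevice[0]), luntarget_cmp_func, NULL);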
+static int spraid_init_ctrl_info(struct spraid_dev *hdev); +static void spraid_fw_act_work(struct work_struct *work) +{ + struct spraid_dev *hdev = container_of(work, struct spraid_dev, fw_act_work); + + if (spraid_init_ctrl_info(hdev)) + dev_err(hdev->dev, "get ctrl info failed after fw act\n"); +} + static void spraid_queue_scan(struct spraid_dev *hdev) { queue_work(spraid_wq, &hdev->scan_work); @@ -2094,6 +2257,9 @@ static void spraid_handle_aen_notice(struct spraid_dev *hdev, u32 result) case SPRAID_AEN_DEV_CHANGED: spraid_queue_scan(hdev); break; + case SPRAID_AEN_FW_ACT_START: + dev_info(hdev->dev, "fw activation starting\n"); + break; case SPRAID_AEN_HOST_PROBING: break; default: @@ -2101,25 +2267,25 @@ static void spraid_handle_aen_notice(struct spraid_dev *hdev, u32 result) } }
-static void spraid_handle_aen_vs(struct spraid_dev *hdev, u32 result) +static void spraid_handle_aen_vs(struct spraid_dev *hdev, u32 result, u32 result1) { - switch (result) { + switch ((result & 0xff00) >> 8) { case SPRAID_AEN_TIMESYN: queue_work(spraid_wq, &hdev->timesyn_work); break; + case SPRAID_AEN_FW_ACT_FINISH: + dev_info(hdev->dev, "fw activation finish\n"); + queue_work(spraid_wq, &hdev->fw_act_work); + break; + case SPRAID_AEN_EVENT_MIN ... SPRAID_AEN_EVENT_MAX: + dev_info(hdev->dev, "rcv card event[%d], param1[0x%x] param2[0x%x]\n", + (result & 0xff00) >> 8, result, result1); + break; default: - dev_warn(hdev->dev, "async event result: %x\n", result); + dev_warn(hdev->dev, "async event result: 0x%x\n", result); } }
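The vendor-specific AEN handler now decodes the event type from bits 15:8 of the completion result instead of matching the raw value, and passes cqe->result1 through for logging. The extraction, as a hypothetical helper (not in the patch):

    static inline u8 spraid_aen_vs_event(u32 result)
    {
    	return (result & 0xff00) >> 8;	/* same expression as the switch above */
    }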
-static void spraid_async_event_work(struct work_struct *work) -{ - struct spraid_dev *hdev = - container_of(work, struct spraid_dev, aen_work); - - spraid_send_aen(hdev); -} - static int spraid_alloc_resources(struct spraid_dev *hdev) { int ret, nqueue; @@ -2149,10 +2315,16 @@ static int spraid_alloc_resources(struct spraid_dev *hdev) goto destroy_dma_pools; }
+ ret = spraid_alloc_admin_cmds(hdev); + if (ret) + goto free_queues; + dev_info(hdev->dev, "[%s] queues num: %d\n", __func__, nqueue);
return 0;
+free_queues: + kfree(hdev->queues); destroy_dma_pools: spraid_destroy_dma_pools(hdev); free_ctrl_info: @@ -2164,50 +2336,18 @@ static int spraid_alloc_resources(struct spraid_dev *hdev)
static void spraid_free_resources(struct spraid_dev *hdev) { + spraid_free_admin_cmds(hdev); kfree(hdev->queues); spraid_destroy_dma_pools(hdev); kfree(hdev->ctrl_info); ida_free(&spraid_instance_ida, hdev->instance); }
-static void spraid_setup_passthrough(struct request *req, struct spraid_admin_command *cmd) -{ - memcpy(cmd, spraid_admin_req(req)->cmd, sizeof(*cmd)); - cmd->common.flags &= ~SPRAID_CMD_FLAG_SGL_ALL; -} - -static inline void spraid_clear_hreq(struct request *req) -{ - if (!(req->rq_flags & RQF_DONTPREP)) { - spraid_admin_req(req)->flags = 0; - req->rq_flags |= RQF_DONTPREP; - } -} - -static blk_status_t spraid_setup_admin_cmd(struct request *req, struct spraid_admin_command *cmd) +static void spraid_bsg_unmap_data(struct spraid_dev *hdev, struct bsg_job *job) { - spraid_clear_hreq(req); - - memset(cmd, 0, sizeof(*cmd)); - switch (req_op(req)) { - case REQ_OP_DRV_IN: - case REQ_OP_DRV_OUT: - spraid_setup_passthrough(req, cmd); - break; - default: - WARN_ON_ONCE(1); - return BLK_STS_IOERR; - } - - cmd->common.command_id = req->tag; - return BLK_STS_OK; -} - -static void spraid_unmap_data(struct spraid_dev *hdev, struct request *req) -{ - struct spraid_iod *iod = blk_mq_rq_to_pdu(req); - enum dma_data_direction dma_dir = rq_data_dir(req) ? - DMA_TO_DEVICE : DMA_FROM_DEVICE; + struct request *rq = blk_mq_rq_from_pdu(job); + struct spraid_iod *iod = job->dd_data; + enum dma_data_direction dma_dir = rq_data_dir(rq) ? DMA_TO_DEVICE : DMA_FROM_DEVICE;
if (iod->nsge) dma_unmap_sg(hdev->dev, iod->sg, iod->nsge, dma_dir); @@ -2215,36 +2355,36 @@ static void spraid_unmap_data(struct spraid_dev *hdev, struct request *req) spraid_free_iod_res(hdev, iod); }
-static blk_status_t spraid_admin_map_data(struct spraid_dev *hdev, struct request *req, - struct spraid_admin_command *cmd) +static int spraid_bsg_map_data(struct spraid_dev *hdev, struct bsg_job *job, + struct spraid_admin_command *cmd) { - struct spraid_iod *iod = blk_mq_rq_to_pdu(req); - struct request_queue *admin_q = req->q; - enum dma_data_direction dma_dir = rq_data_dir(req) ? - DMA_TO_DEVICE : DMA_FROM_DEVICE; - blk_status_t ret = BLK_STS_IOERR; - int nr_mapped; - int res; + struct request *rq = blk_mq_rq_from_pdu(job); + struct spraid_iod *iod = job->dd_data; + enum dma_data_direction dma_dir = rq_data_dir(rq) ? DMA_TO_DEVICE : DMA_FROM_DEVICE; + int ret = 0; + + iod->sg = job->request_payload.sg_list; + iod->nsge = job->request_payload.sg_cnt; + iod->length = job->request_payload.payload_len; + iod->use_sgl = false; + iod->npages = -1; + iod->sg_drv_mgmt = false;
- sg_init_table(iod->sg, blk_rq_nr_phys_segments(req)); - iod->nsge = blk_rq_map_sg(admin_q, req, iod->sg); if (!iod->nsge) goto out;
- dev_info(hdev->dev, "nseg: %u, nsge: %u\n", - blk_rq_nr_phys_segments(req), iod->nsge); - - ret = BLK_STS_RESOURCE; - nr_mapped = dma_map_sg_attrs(hdev->dev, iod->sg, iod->nsge, dma_dir, DMA_ATTR_NO_WARN); - if (!nr_mapped) + ret = dma_map_sg_attrs(hdev->dev, iod->sg, iod->nsge, dma_dir, DMA_ATTR_NO_WARN); + if (!ret) goto out;
- res = spraid_setup_prps(hdev, iod); - if (res) + ret = spraid_setup_prps(hdev, iod); + if (ret) goto unmap; + cmd->common.dptr.prp1 = cpu_to_le64(sg_dma_address(iod->sg)); cmd->common.dptr.prp2 = cpu_to_le64(iod->first_dma); - return BLK_STS_OK; + + return 0;
unmap: dma_unmap_sg(hdev->dev, iod->sg, iod->nsge, dma_dir); @@ -2252,137 +2392,29 @@ static blk_status_t spraid_admin_map_data(struct spraid_dev *hdev, struct reques return ret; }
-static blk_status_t spraid_init_admin_iod(struct request *rq, struct spraid_dev *hdev) -{ - struct spraid_iod *iod = blk_mq_rq_to_pdu(rq); - int nents = blk_rq_nr_phys_segments(rq); - unsigned int size = blk_rq_payload_bytes(rq); - - if (nents > SPRAID_INT_PAGES || size > SPRAID_INT_BYTES(hdev)) { - iod->sg = mempool_alloc(hdev->iod_mempool, GFP_ATOMIC); - if (!iod->sg) - return BLK_STS_RESOURCE; - } else { - iod->sg = iod->inline_sg; - } - - iod->nsge = 0; - iod->use_sgl = false; - iod->npages = -1; - iod->length = size; - iod->sg_drv_mgmt = true; - - return BLK_STS_OK; -} - -static blk_status_t spraid_queue_admin_rq(struct blk_mq_hw_ctx *hctx, - const struct blk_mq_queue_data *bd) -{ - struct spraid_queue *adminq = hctx->driver_data; - struct spraid_dev *hdev = adminq->hdev; - struct request *req = bd->rq; - struct spraid_iod *iod = blk_mq_rq_to_pdu(req); - struct spraid_admin_command cmd; - blk_status_t ret; - - ret = spraid_setup_admin_cmd(req, &cmd); - if (ret) - goto out; - - ret = spraid_init_admin_iod(req, hdev); - if (ret) - goto out; - - if (blk_rq_nr_phys_segments(req)) { - ret = spraid_admin_map_data(hdev, req, &cmd); - if (ret) - goto cleanup_iod; - } - - blk_mq_start_request(req); - spraid_submit_cmd(adminq, &cmd); - return BLK_STS_OK; - -cleanup_iod: - spraid_free_iod_res(hdev, iod); -out: - return ret; -} - -static blk_status_t spraid_error_status(struct request *req) -{ - switch (spraid_admin_req(req)->status & 0x7ff) { - case SPRAID_SC_SUCCESS: - return BLK_STS_OK; - default: - return BLK_STS_IOERR; - } -} - -static void spraid_complete_admin_rq(struct request *req) -{ - struct spraid_iod *iod = blk_mq_rq_to_pdu(req); - struct spraid_dev *hdev = iod->spraidq->hdev; - - if (blk_rq_nr_phys_segments(req)) - spraid_unmap_data(hdev, req); - blk_mq_end_request(req, spraid_error_status(req)); -} - -static int spraid_admin_init_hctx(struct blk_mq_hw_ctx *hctx, void *data, unsigned int hctx_idx) -{ - struct spraid_dev *hdev = data; - struct spraid_queue *adminq = &hdev->queues[0]; - - WARN_ON(hctx_idx != 0); - WARN_ON(hdev->admin_tagset.tags[0] != hctx->tags); - - hctx->driver_data = adminq; - return 0; -} - -static int spraid_admin_init_request(struct blk_mq_tag_set *set, struct request *req, - unsigned int hctx_idx, unsigned int numa_node) -{ - struct spraid_dev *hdev = set->driver_data; - struct spraid_iod *iod = blk_mq_rq_to_pdu(req); - struct spraid_queue *adminq = &hdev->queues[0]; - - WARN_ON(!adminq); - iod->spraidq = adminq; - return 0; -} - -static enum blk_eh_timer_return -spraid_admin_timeout(struct request *req, bool reserved) -{ - struct spraid_iod *iod = blk_mq_rq_to_pdu(req); - struct spraid_queue *spraidq = iod->spraidq; - struct spraid_dev *hdev = spraidq->hdev; - - dev_err(hdev->dev, "Admin cid[%d] qid[%d] timeout\n", - req->tag, spraidq->qid); - - if (spraid_poll_cq(spraidq, req->tag)) { - dev_warn(hdev->dev, "cid[%d] qid[%d] timeout, completion polled\n", - req->tag, spraidq->qid); - return BLK_EH_DONE; - } - - spraid_end_admin_request(req, cpu_to_le16(-EINVAL), 0, 0); - return BLK_EH_DONE; -} - static int spraid_get_ctrl_info(struct spraid_dev *hdev, struct spraid_ctrl_info *ctrl_info) { struct spraid_admin_command admin_cmd; + u8 *data_ptr = NULL; + dma_addr_t data_dma = 0; + int ret; + + data_ptr = dma_alloc_coherent(hdev->dev, PAGE_SIZE, &data_dma, GFP_KERNEL); + if (!data_ptr) + return -ENOMEM;
memset(&admin_cmd, 0, sizeof(admin_cmd)); admin_cmd.get_info.opcode = SPRAID_ADMIN_GET_INFO; admin_cmd.get_info.type = SPRAID_GET_INFO_CTRL; + admin_cmd.common.dptr.prp1 = cpu_to_le64(data_dma); + + ret = spraid_submit_admin_sync_cmd(hdev, &admin_cmd, NULL, NULL, 0); + if (!ret) + memcpy(ctrl_info, data_ptr, sizeof(struct spraid_ctrl_info));
- return spraid_submit_admin_sync_cmd(hdev->admin_q, &admin_cmd, NULL, - ctrl_info, sizeof(struct spraid_ctrl_info), 0, 0, 0); + dma_free_coherent(hdev->dev, PAGE_SIZE, data_ptr, data_dma); + + return ret; }
static int spraid_init_ctrl_info(struct spraid_dev *hdev) @@ -2416,6 +2448,11 @@ static int spraid_init_ctrl_info(struct spraid_dev *hdev) dev_info(hdev->dev, "[%s]sn = %s\n", __func__, hdev->ctrl_info->sn); dev_info(hdev->dev, "[%s]fr = %s\n", __func__, hdev->ctrl_info->fr);
+ if (!hdev->ctrl_info->aerl) + hdev->ctrl_info->aerl = 1; + if (hdev->ctrl_info->aerl > SPRAID_NR_AEN_COMMANDS) + hdev->ctrl_info->aerl = SPRAID_NR_AEN_COMMANDS; + return 0; }
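Clamping aerl to [1, SPRAID_NR_AEN_COMMANDS] guarantees spraid_send_all_aen() posts at least one AEN and never more than the reserved cid slots. An equivalent one-liner, if preferred (illustration only; assumes aerl is a u8 as suggested by its use here):

    hdev->ctrl_info->aerl = clamp_t(u8, hdev->ctrl_info->aerl,
    				    1, SPRAID_NR_AEN_COMMANDS);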
@@ -2444,98 +2481,51 @@ static void spraid_free_iod_ext_mem_pool(struct spraid_dev *hdev) mempool_destroy(hdev->iod_mempool); }
-static int spraid_submit_user_cmd(struct request_queue *q, struct spraid_admin_command *cmd, - void __user *ubuffer, unsigned int bufflen, u32 *result, - unsigned int timeout) +static int spraid_user_admin_cmd(struct spraid_dev *hdev, struct bsg_job *job) { - struct request *req; - struct bio *bio = NULL; - int ret; - - req = spraid_alloc_admin_request(q, cmd, 0); - if (IS_ERR(req)) - return PTR_ERR(req); + struct spraid_bsg_request *bsg_req = job->request; + struct spraid_passthru_common_cmd *cmd = &(bsg_req->admcmd); + struct spraid_admin_command admin_cmd; + u32 timeout = msecs_to_jiffies(cmd->timeout_ms); + u32 result[2] = {0}; + int status;
- req->timeout = timeout ? timeout : ADMIN_TIMEOUT; - spraid_admin_req(req)->flags |= SPRAID_REQ_USERCMD; + if (hdev->state >= SPRAID_RESETTING) { + dev_err(hdev->dev, "[%s] err, host state:[%d] is not right\n", + __func__, hdev->state); + return -EBUSY; + }
- if (ubuffer && bufflen) { - ret = blk_rq_map_user(q, req, NULL, ubuffer, bufflen, GFP_KERNEL); - if (ret) - goto out; - bio = req->bio; + memset(&admin_cmd, 0, sizeof(admin_cmd)); + admin_cmd.common.opcode = cmd->opcode; + admin_cmd.common.flags = cmd->flags; + admin_cmd.common.hdid = cpu_to_le32(cmd->nsid); + admin_cmd.common.cdw2[0] = cpu_to_le32(cmd->cdw2); + admin_cmd.common.cdw2[1] = cpu_to_le32(cmd->cdw3); + admin_cmd.common.cdw10 = cpu_to_le32(cmd->cdw10); + admin_cmd.common.cdw11 = cpu_to_le32(cmd->cdw11); + admin_cmd.common.cdw12 = cpu_to_le32(cmd->cdw12); + admin_cmd.common.cdw13 = cpu_to_le32(cmd->cdw13); + admin_cmd.common.cdw14 = cpu_to_le32(cmd->cdw14); + admin_cmd.common.cdw15 = cpu_to_le32(cmd->cdw15); + + status = spraid_bsg_map_data(hdev, job, &admin_cmd); + if (status) { + dev_err(hdev->dev, "[%s] err, map data failed\n", __func__); + return status; } - blk_execute_rq(req->q, NULL, req, 0); - if (spraid_admin_req(req)->flags & SPRAID_REQ_CANCELLED) - ret = -EINTR; - else - ret = spraid_admin_req(req)->status; - if (result) { - result[0] = spraid_admin_req(req)->result0; - result[1] = spraid_admin_req(req)->result1; - } - if (bio) - blk_rq_unmap_user(bio); -out: - blk_mq_free_request(req); - return ret; -}
-static int spraid_user_admin_cmd(struct spraid_dev *hdev, - struct spraid_passthru_common_cmd __user *ucmd) -{ - struct spraid_passthru_common_cmd cmd; - struct spraid_admin_command admin_cmd; - u32 timeout = 0; - int status; - - if (!capable(CAP_SYS_ADMIN)) { - dev_err(hdev->dev, "Current user hasn't administrator right, reject service\n"); - return -EACCES; - } - - if (copy_from_user(&cmd, ucmd, sizeof(cmd))) { - dev_err(hdev->dev, "Copy command from user space to kernel space failed\n"); - return -EFAULT; - } - - if (cmd.flags) { - dev_err(hdev->dev, "Invalid flags in user command\n"); - return -EINVAL; + status = spraid_submit_admin_sync_cmd(hdev, &admin_cmd, &result[0], &result[1], timeout); + if (status >= 0) { + job->reply_len = sizeof(result); + memcpy(job->reply, result, sizeof(result)); }
- dev_info(hdev->dev, "user_admin_cmd opcode: 0x%x, subopcode: 0x%x\n", - cmd.opcode, cmd.cdw2 & 0x7ff); + if (status) + dev_info(hdev->dev, "[%s] opcode[0x%x] subopcode[0x%x], status[0x%x] result0[0x%x] result1[0x%x]\n", + __func__, cmd->opcode, cmd->info_0.subopcode, status, result[0], result[1]);
- memset(&admin_cmd, 0, sizeof(admin_cmd)); - admin_cmd.common.opcode = cmd.opcode; - admin_cmd.common.flags = cmd.flags; - admin_cmd.common.hdid = cpu_to_le32(cmd.nsid); - admin_cmd.common.cdw2[0] = cpu_to_le32(cmd.cdw2); - admin_cmd.common.cdw2[1] = cpu_to_le32(cmd.cdw3); - admin_cmd.common.cdw10 = cpu_to_le32(cmd.cdw10); - admin_cmd.common.cdw11 = cpu_to_le32(cmd.cdw11); - admin_cmd.common.cdw12 = cpu_to_le32(cmd.cdw12); - admin_cmd.common.cdw13 = cpu_to_le32(cmd.cdw13); - admin_cmd.common.cdw14 = cpu_to_le32(cmd.cdw14); - admin_cmd.common.cdw15 = cpu_to_le32(cmd.cdw15); - - if (cmd.timeout_ms) - timeout = msecs_to_jiffies(cmd.timeout_ms); - - status = spraid_submit_user_cmd(hdev->admin_q, &admin_cmd, - (void __user *)(uintptr_t)cmd.addr, cmd.info_1.data_len, - &cmd.result0, timeout); - - dev_info(hdev->dev, "user_admin_cmd status: 0x%x, result0: 0x%x, result1: 0x%x\n", - status, cmd.result0, cmd.result1); - - if (status >= 0) { - if (put_user(cmd.result0, &ucmd->result0)) - return -EFAULT; - if (put_user(cmd.result1, &ucmd->result1)) - return -EFAULT; - } + spraid_bsg_unmap_data(hdev, job);
return status; } @@ -2548,8 +2538,8 @@ static int spraid_alloc_ioq_ptcmds(struct spraid_dev *hdev) INIT_LIST_HEAD(&hdev->ioq_pt_list); spin_lock_init(&hdev->ioq_pt_lock);
- hdev->ioq_ptcmds = kcalloc_node(ptnum, sizeof(struct spraid_ioq_ptcmd), - GFP_KERNEL, hdev->numa_node); + hdev->ioq_ptcmds = kcalloc_node(ptnum, sizeof(struct spraid_cmd), + GFP_KERNEL, hdev->numa_node);
if (!hdev->ioq_ptcmds) { dev_err(hdev->dev, "Alloc ioq_ptcmds failed\n"); @@ -2567,55 +2557,35 @@ static int spraid_alloc_ioq_ptcmds(struct spraid_dev *hdev) return 0; }
-static struct spraid_ioq_ptcmd *spraid_get_ioq_ptcmd(struct spraid_dev *hdev) -{ - struct spraid_ioq_ptcmd *cmd = NULL; - unsigned long flags; - - spin_lock_irqsave(&hdev->ioq_pt_lock, flags); - if (list_empty(&hdev->ioq_pt_list)) { - spin_unlock_irqrestore(&hdev->ioq_pt_lock, flags); - dev_err(hdev->dev, "err, ioq ptcmd list empty\n"); - return NULL; - } - cmd = list_entry((&hdev->ioq_pt_list)->next, struct spraid_ioq_ptcmd, list); - list_del_init(&cmd->list); - spin_unlock_irqrestore(&hdev->ioq_pt_lock, flags); - - WRITE_ONCE(cmd->state, SPRAID_CMD_IDLE); - - return cmd; -} - -static void spraid_put_ioq_ptcmd(struct spraid_dev *hdev, struct spraid_ioq_ptcmd *cmd) +static void spraid_free_ioq_ptcmds(struct spraid_dev *hdev) { - unsigned long flags; + kfree(hdev->ioq_ptcmds); + hdev->ioq_ptcmds = NULL;
- spin_lock_irqsave(&hdev->ioq_pt_lock, flags); - list_add(&cmd->list, (&hdev->ioq_pt_list)->next); - spin_unlock_irqrestore(&hdev->ioq_pt_lock, flags); + INIT_LIST_HEAD(&hdev->ioq_pt_list); }
static int spraid_submit_ioq_sync_cmd(struct spraid_dev *hdev, struct spraid_ioq_command *cmd, - u32 *result, void **sense, u32 timeout) + u32 *result, u32 *reslen, u32 timeout) { - struct spraid_queue *ioq; int ret; dma_addr_t sense_dma; - struct spraid_ioq_ptcmd *pt_cmd = spraid_get_ioq_ptcmd(hdev); - - *sense = NULL; + struct spraid_queue *ioq; + void *sense_addr = NULL; + struct spraid_cmd *pt_cmd = spraid_get_cmd(hdev, SPRAID_CMD_IOPT);
- if (!pt_cmd) + if (!pt_cmd) { + dev_err(hdev->dev, "err, get ioq cmd failed\n"); return -EFAULT; + }
- dev_info(hdev->dev, "[%s] ptcmd, cid[%d], qid[%d]\n", __func__, pt_cmd->cid, pt_cmd->qid); + timeout = timeout ? timeout : ADMIN_TIMEOUT;
init_completion(&pt_cmd->cmd_done);
ioq = &hdev->queues[pt_cmd->qid]; ret = pt_cmd->cid * SCSI_SENSE_BUFFERSIZE; - pt_cmd->priv = ioq->sense + ret; + sense_addr = ioq->sense + ret; sense_dma = ioq->sense_dma_addr + ret;
cmd->common.sense_addr = cpu_to_le64(sense_dma); @@ -2625,260 +2595,87 @@ static int spraid_submit_ioq_sync_cmd(struct spraid_dev *hdev, struct spraid_ioq spraid_submit_cmd(ioq, cmd);
if (!wait_for_completion_timeout(&pt_cmd->cmd_done, timeout)) { - dev_err(hdev->dev, "[%s] cid[%d], qid[%d] timeout\n", - __func__, pt_cmd->cid, pt_cmd->qid); + dev_err(hdev->dev, "[%s] cid[%d] qid[%d] timeout, opcode[0x%x] subopcode[0x%x]\n", + __func__, pt_cmd->cid, pt_cmd->qid, cmd->common.opcode, + (le32_to_cpu(cmd->common.cdw3[0]) & 0xffff)); WRITE_ONCE(pt_cmd->state, SPRAID_CMD_TIMEOUT); - return -EINVAL; + spraid_put_cmd(hdev, pt_cmd, SPRAID_CMD_IOPT); + return -ETIME; }
- if (result) { - result[0] = pt_cmd->result0; - result[1] = pt_cmd->result1; + if (result && reslen) { + if ((pt_cmd->status & 0x17f) == 0x101) { + memcpy(result, sense_addr, SCSI_SENSE_BUFFERSIZE); + *reslen = SCSI_SENSE_BUFFERSIZE; + } }
- if ((pt_cmd->status & 0x17f) == 0x101) - *sense = pt_cmd->priv; + spraid_put_cmd(hdev, pt_cmd, SPRAID_CMD_IOPT);
return pt_cmd->status; }
-static int spraid_user_ioq_cmd(struct spraid_dev *hdev, - struct spraid_ioq_passthru_cmd __user *ucmd) +static int spraid_user_ioq_cmd(struct spraid_dev *hdev, struct bsg_job *job) { - struct spraid_ioq_passthru_cmd cmd; + struct spraid_bsg_request *bsg_req = (struct spraid_bsg_request *)(job->request); + struct spraid_ioq_passthru_cmd *cmd = &(bsg_req->ioqcmd); struct spraid_ioq_command ioq_cmd; - u32 timeout = 0; int status = 0; - u8 *data_ptr = NULL; - dma_addr_t data_dma; - enum dma_data_direction dma_dir = DMA_NONE; - void *sense = NULL; - - if (!capable(CAP_SYS_ADMIN)) { - dev_err(hdev->dev, "Current user hasn't administrator right, reject service\n"); - return -EACCES; - } + u32 timeout = msecs_to_jiffies(cmd->timeout_ms);
- if (copy_from_user(&cmd, ucmd, sizeof(cmd))) { - dev_err(hdev->dev, "Copy command from user space to kernel space failed\n"); - return -EFAULT; - } - - if (cmd.data_len > PAGE_SIZE) { + if (cmd->data_len > PAGE_SIZE) { dev_err(hdev->dev, "[%s] data len bigger than 4k\n", __func__); return -EFAULT; }
- dev_info(hdev->dev, "[%s] opcode: 0x%x, subopcode: 0x%x, datalen: %d\n", - __func__, cmd.opcode, cmd.info_1.subopcode, cmd.data_len); - - if (cmd.addr && cmd.data_len) { - data_ptr = dma_alloc_coherent(hdev->dev, PAGE_SIZE, &data_dma, GFP_KERNEL); - if (!data_ptr) - return -ENOMEM; - - dma_dir = (cmd.opcode & 1) ? DMA_TO_DEVICE : DMA_FROM_DEVICE; + if (hdev->state != SPRAID_LIVE) { + dev_err(hdev->dev, "[%s] err, host state:[%d] is not live\n", + __func__, hdev->state); + return -EBUSY; }
- if (dma_dir == DMA_TO_DEVICE) { - if (copy_from_user(data_ptr, (void __user *)(uintptr_t)cmd.addr, cmd.data_len)) { - dev_err(hdev->dev, "[%s] copy user data failed\n", __func__); - status = -EFAULT; - goto free_dma_mem; - } - } + dev_info(hdev->dev, "[%s] opcode[0x%x] subopcode[0x%x] init, datalen[%d]\n", + __func__, cmd->opcode, cmd->info_1.subopcode, cmd->data_len);
memset(&ioq_cmd, 0, sizeof(ioq_cmd)); - ioq_cmd.common.opcode = cmd.opcode; - ioq_cmd.common.flags = cmd.flags; - ioq_cmd.common.hdid = cpu_to_le32(cmd.nsid); - ioq_cmd.common.sense_len = cpu_to_le16(cmd.info_0.res_sense_len); - ioq_cmd.common.cdb_len = cmd.info_0.cdb_len; - ioq_cmd.common.rsvd2 = cmd.info_0.rsvd0; - ioq_cmd.common.cdw3[0] = cpu_to_le32(cmd.cdw3); - ioq_cmd.common.cdw3[1] = cpu_to_le32(cmd.cdw4); - ioq_cmd.common.cdw3[2] = cpu_to_le32(cmd.cdw5); - ioq_cmd.common.dptr.prp1 = cpu_to_le64(data_dma); - - ioq_cmd.common.cdw10[0] = cpu_to_le32(cmd.cdw10); - ioq_cmd.common.cdw10[1] = cpu_to_le32(cmd.cdw11); - ioq_cmd.common.cdw10[2] = cpu_to_le32(cmd.cdw12); - ioq_cmd.common.cdw10[3] = cpu_to_le32(cmd.cdw13); - ioq_cmd.common.cdw10[4] = cpu_to_le32(cmd.cdw14); - ioq_cmd.common.cdw10[5] = cpu_to_le32(cmd.data_len); - - memcpy(ioq_cmd.common.cdb, &cmd.cdw16, cmd.info_0.cdb_len); - - ioq_cmd.common.cdw26[0] = cpu_to_le32(cmd.cdw26[0]); - ioq_cmd.common.cdw26[1] = cpu_to_le32(cmd.cdw26[1]); - ioq_cmd.common.cdw26[2] = cpu_to_le32(cmd.cdw26[2]); - ioq_cmd.common.cdw26[3] = cpu_to_le32(cmd.cdw26[3]); - - if (cmd.timeout_ms) - timeout = msecs_to_jiffies(cmd.timeout_ms); - timeout = timeout ? timeout : ADMIN_TIMEOUT; - - status = spraid_submit_ioq_sync_cmd(hdev, &ioq_cmd, &cmd.result0, &sense, timeout); - - if (status >= 0) { - if (put_user(cmd.result0, &ucmd->result0)) { - status = -EFAULT; - goto free_dma_mem; - } - if (put_user(cmd.result1, &ucmd->result1)) { - status = -EFAULT; - goto free_dma_mem; - } - if (dma_dir == DMA_FROM_DEVICE && - copy_to_user((void __user *)(uintptr_t)cmd.addr, data_ptr, cmd.data_len)) { - status = -EFAULT; - goto free_dma_mem; - } - } - - if (sense) { - if (copy_to_user((void *__user *)(uintptr_t)cmd.sense_addr, - sense, cmd.info_0.res_sense_len)) { - status = -EFAULT; - goto free_dma_mem; - } - } - -free_dma_mem: - if (data_ptr) - dma_free_coherent(hdev->dev, PAGE_SIZE, data_ptr, data_dma); - - return status; - -} - -static int spraid_reset_work_sync(struct spraid_dev *hdev); - -static int spraid_user_reset_cmd(struct spraid_dev *hdev) -{ - int ret; - - dev_info(hdev->dev, "[%s] start user reset cmd\n", __func__); - ret = spraid_reset_work_sync(hdev); - dev_info(hdev->dev, "[%s] stop user reset cmd[%d]\n", __func__, ret); - - return ret; -} - -static int hdev_open(struct inode *inode, struct file *file) -{ - struct spraid_dev *hdev = - container_of(inode->i_cdev, struct spraid_dev, cdev); - file->private_data = hdev; - return 0; -} - -static long hdev_ioctl(struct file *file, u32 cmd, unsigned long arg) -{ - struct spraid_dev *hdev = file->private_data; - void __user *argp = (void __user *)arg; - - switch (cmd) { - case SPRAID_IOCTL_ADMIN_CMD: - return spraid_user_admin_cmd(hdev, argp); - case SPRAID_IOCTL_IOQ_CMD: - return spraid_user_ioq_cmd(hdev, argp); - case SPRAID_IOCTL_RESET_CMD: - return spraid_user_reset_cmd(hdev); - default: - return -ENOTTY; - } -} - -static const struct file_operations spraid_dev_fops = { - .owner = THIS_MODULE, - .open = hdev_open, - .unlocked_ioctl = hdev_ioctl, - .compat_ioctl = hdev_ioctl, -}; - -static int spraid_create_cdev(struct spraid_dev *hdev) -{ - int ret; - - device_initialize(&hdev->ctrl_device); - hdev->ctrl_device.devt = MKDEV(MAJOR(spraid_chr_devt), hdev->instance); - hdev->ctrl_device.class = spraid_class; - hdev->ctrl_device.parent = hdev->dev; - dev_set_drvdata(&hdev->ctrl_device, hdev); - ret = dev_set_name(&hdev->ctrl_device, "spraid%d", hdev->instance); - if (ret) - return ret; - cdev_init(&hdev->cdev, 
&spraid_dev_fops); - hdev->cdev.owner = THIS_MODULE; - ret = cdev_device_add(&hdev->cdev, &hdev->ctrl_device); - if (ret) { - dev_err(hdev->dev, "Add cdev failed, ret: %d", ret); - put_device(&hdev->ctrl_device); - kfree_const(hdev->ctrl_device.kobj.name); - return ret; - } - - return 0; -} - -static inline void spraid_remove_cdev(struct spraid_dev *hdev) -{ - cdev_device_del(&hdev->cdev, &hdev->ctrl_device); -} - -static const struct blk_mq_ops spraid_admin_mq_ops = { - .queue_rq = spraid_queue_admin_rq, - .complete = spraid_complete_admin_rq, - .init_hctx = spraid_admin_init_hctx, - .init_request = spraid_admin_init_request, - .timeout = spraid_admin_timeout, -}; - -static void spraid_remove_admin_tagset(struct spraid_dev *hdev) -{ - if (hdev->admin_q && !blk_queue_dying(hdev->admin_q)) { - blk_mq_unquiesce_queue(hdev->admin_q); - blk_cleanup_queue(hdev->admin_q); - blk_mq_free_tag_set(&hdev->admin_tagset); + ioq_cmd.common.opcode = cmd->opcode; + ioq_cmd.common.flags = cmd->flags; + ioq_cmd.common.hdid = cpu_to_le32(cmd->nsid); + ioq_cmd.common.sense_len = cpu_to_le16(cmd->info_0.res_sense_len); + ioq_cmd.common.cdb_len = cmd->info_0.cdb_len; + ioq_cmd.common.rsvd2 = cmd->info_0.rsvd0; + ioq_cmd.common.cdw3[0] = cpu_to_le32(cmd->cdw3); + ioq_cmd.common.cdw3[1] = cpu_to_le32(cmd->cdw4); + ioq_cmd.common.cdw3[2] = cpu_to_le32(cmd->cdw5); + + ioq_cmd.common.cdw10[0] = cpu_to_le32(cmd->cdw10); + ioq_cmd.common.cdw10[1] = cpu_to_le32(cmd->cdw11); + ioq_cmd.common.cdw10[2] = cpu_to_le32(cmd->cdw12); + ioq_cmd.common.cdw10[3] = cpu_to_le32(cmd->cdw13); + ioq_cmd.common.cdw10[4] = cpu_to_le32(cmd->cdw14); + ioq_cmd.common.cdw10[5] = cpu_to_le32(cmd->data_len); + + memcpy(ioq_cmd.common.cdb, &cmd->cdw16, cmd->info_0.cdb_len); + + ioq_cmd.common.cdw26[0] = cpu_to_le32(cmd->cdw26[0]); + ioq_cmd.common.cdw26[1] = cpu_to_le32(cmd->cdw26[1]); + ioq_cmd.common.cdw26[2] = cpu_to_le32(cmd->cdw26[2]); + ioq_cmd.common.cdw26[3] = cpu_to_le32(cmd->cdw26[3]); + + status = spraid_bsg_map_data(hdev, job, (struct spraid_admin_command *)&ioq_cmd); + if (status) { + dev_err(hdev->dev, "[%s] err, map data failed\n", __func__); + return status; } -}
-static int spraid_alloc_admin_tags(struct spraid_dev *hdev) -{ - if (!hdev->admin_q) { - hdev->admin_tagset.ops = &spraid_admin_mq_ops; - hdev->admin_tagset.nr_hw_queues = 1; + status = spraid_submit_ioq_sync_cmd(hdev, &ioq_cmd, job->reply, &job->reply_len, timeout);
- hdev->admin_tagset.queue_depth = SPRAID_AQ_MQ_TAG_DEPTH; - hdev->admin_tagset.timeout = ADMIN_TIMEOUT; - hdev->admin_tagset.numa_node = hdev->numa_node; - hdev->admin_tagset.cmd_size = - spraid_cmd_size(hdev, true, false); - hdev->admin_tagset.flags = BLK_MQ_F_NO_SCHED; - hdev->admin_tagset.driver_data = hdev; + dev_info(hdev->dev, "[%s] opcode[0x%x] subopcode[0x%x], status[0x%x], reply_len[%d]\n", + __func__, cmd->opcode, cmd->info_1.subopcode, status, job->reply_len);
- if (blk_mq_alloc_tag_set(&hdev->admin_tagset)) { - dev_err(hdev->dev, "Allocate admin tagset failed\n"); - return -ENOMEM; - } + spraid_bsg_unmap_data(hdev, job);
- hdev->admin_q = blk_mq_init_queue(&hdev->admin_tagset); - if (IS_ERR(hdev->admin_q)) { - dev_err(hdev->dev, "Initialize admin request queue failed\n"); - blk_mq_free_tag_set(&hdev->admin_tagset); - return -ENOMEM; - } - if (!blk_get_queue(hdev->admin_q)) { - dev_err(hdev->dev, "Get admin request queue failed\n"); - spraid_remove_admin_tagset(hdev); - hdev->admin_q = NULL; - return -ENODEV; - } - } else { - blk_mq_unquiesce_queue(hdev->admin_q); - } - return 0; + return status; }
static bool spraid_check_scmd_completed(struct scsi_cmnd *scmd) @@ -2891,7 +2688,7 @@ static bool spraid_check_scmd_completed(struct scsi_cmnd *scmd) spraid_get_tag_from_scmd(scmd, &hwq, &cid); spraidq = &hdev->queues[hwq]; if (READ_ONCE(iod->state) == SPRAID_CMD_COMPLETE || spraid_poll_cq(spraidq, cid)) { - dev_warn(hdev->dev, "cid[%d], qid[%d] has been completed\n", + dev_warn(hdev->dev, "cid[%d] qid[%d] has been completed\n", cid, spraidq->qid); return true; } @@ -2927,8 +2724,7 @@ static int spraid_send_abort_cmd(struct spraid_dev *hdev, u32 hdid, u16 qid, u16 admin_cmd.abort.sqid = cpu_to_le16(qid); admin_cmd.abort.cid = cpu_to_le16(cid);
- return spraid_submit_admin_sync_cmd(hdev->admin_q, &admin_cmd, NULL, - NULL, 0, 0, 0, 0); + return spraid_submit_admin_sync_cmd(hdev, &admin_cmd, NULL, NULL, 0); }
/* send reset command by admin queue temporarily */ @@ -2941,8 +2737,7 @@ static int spraid_send_reset_cmd(struct spraid_dev *hdev, int type, u32 hdid) admin_cmd.reset.hdid = cpu_to_le32(hdid); admin_cmd.reset.type = type;
- return spraid_submit_admin_sync_cmd(hdev->admin_q, &admin_cmd, NULL, - NULL, 0, 0, 0, 0); + return spraid_submit_admin_sync_cmd(hdev, &admin_cmd, NULL, NULL, 0); }
static bool spraid_change_host_state(struct spraid_dev *hdev, enum spraid_state newstate) @@ -3022,7 +2817,7 @@ static void spraid_back_fault_cqe(struct spraid_queue *ioq, struct spraid_comple scsi_dma_unmap(scmd); spraid_free_iod_res(hdev, iod); scmd->scsi_done(scmd); - dev_warn(hdev->dev, "Back fault CQE, cid[%d], qid[%d]\n", + dev_warn(hdev->dev, "Back fault CQE, cid[%d] qid[%d]\n", cqe->cmd_id, ioq->qid); }
@@ -3032,6 +2827,8 @@ static void spraid_back_all_io(struct spraid_dev *hdev) struct spraid_queue *ioq; struct spraid_completion cqe = { 0 };
+ scsi_block_requests(hdev->shost); + for (i = 1; i <= hdev->shost->nr_hw_queues; i++) { ioq = &hdev->queues[i]; for (j = 0; j < hdev->shost->can_queue; j++) { @@ -3039,6 +2836,8 @@ static void spraid_back_all_io(struct spraid_dev *hdev) spraid_back_fault_cqe(ioq, &cqe); } } + + scsi_unblock_requests(hdev->shost); }
static void spraid_dev_disable(struct spraid_dev *hdev, bool shutdown) @@ -3106,17 +2905,13 @@ static void spraid_reset_work(struct work_struct *work) if (ret) goto pci_disable;
- ret = spraid_alloc_admin_tags(hdev); - if (ret) - goto pci_disable; - ret = spraid_setup_io_queues(hdev); if (ret || hdev->online_queues <= hdev->shost->nr_hw_queues) goto pci_disable;
spraid_change_host_state(hdev, SPRAID_LIVE);
- spraid_send_aen(hdev); + spraid_send_all_aen(hdev);
return;
@@ -3174,8 +2969,8 @@ static int spraid_abort_handler(struct scsi_cmnd *scmd)
scsi_print_command(scmd);
- if (!spraid_wait_abnl_cmd_done(iod) || spraid_check_scmd_completed(scmd) || - hdev->state != SPRAID_LIVE) + if (hdev->state != SPRAID_LIVE || !spraid_wait_abnl_cmd_done(iod) || + spraid_check_scmd_completed(scmd)) return SUCCESS;
hostdata = scmd->device->hostdata; @@ -3183,7 +2978,7 @@ static int spraid_abort_handler(struct scsi_cmnd *scmd)
dev_warn(hdev->dev, "cid[%d] qid[%d] timeout, aborting\n", cid, hwq); ret = spraid_send_abort_cmd(hdev, hostdata->hdid, hwq, cid); - if (ret != ADMIN_ERR_TIMEOUT) { + if (ret != -ETIME) { ret = spraid_wait_abnl_cmd_done(iod); if (ret) { dev_warn(hdev->dev, "cid[%d] qid[%d] abort failed, not found\n", cid, hwq); @@ -3206,8 +3001,8 @@ static int spraid_tgt_reset_handler(struct scsi_cmnd *scmd)
scsi_print_command(scmd);
- if (!spraid_wait_abnl_cmd_done(iod) || spraid_check_scmd_completed(scmd) || - hdev->state != SPRAID_LIVE) + if (hdev->state != SPRAID_LIVE || !spraid_wait_abnl_cmd_done(iod) || + spraid_check_scmd_completed(scmd)) return SUCCESS;
hostdata = scmd->device->hostdata; @@ -3241,8 +3036,8 @@ static int spraid_bus_reset_handler(struct scsi_cmnd *scmd)
scsi_print_command(scmd);
- if (!spraid_wait_abnl_cmd_done(iod) || spraid_check_scmd_completed(scmd) || - hdev->state != SPRAID_LIVE) + if (hdev->state != SPRAID_LIVE || !spraid_wait_abnl_cmd_done(iod) || + spraid_check_scmd_completed(scmd)) return SUCCESS;
hostdata = scmd->device->hostdata; @@ -3272,7 +3067,7 @@ static int spraid_shost_reset_handler(struct scsi_cmnd *scmd) struct spraid_dev *hdev = shost_priv(scmd->device->host);
scsi_print_command(scmd); - if (spraid_check_scmd_completed(scmd) || hdev->state != SPRAID_LIVE) + if (hdev->state != SPRAID_LIVE || spraid_check_scmd_completed(scmd)) return SUCCESS;
spraid_get_tag_from_scmd(scmd, &hwq, &cid); @@ -3288,6 +3083,62 @@ static int spraid_shost_reset_handler(struct scsi_cmnd *scmd) return SUCCESS; }
+static pci_ers_result_t spraid_pci_error_detected(struct pci_dev *pdev, + pci_channel_state_t state) +{ + struct spraid_dev *hdev = pci_get_drvdata(pdev); + + dev_info(hdev->dev, "enter pci error detect, state:%d\n", state); + + switch (state) { + case pci_channel_io_normal: + dev_warn(hdev->dev, "channel is normal, do nothing\n"); + + return PCI_ERS_RESULT_CAN_RECOVER; + case pci_channel_io_frozen: + dev_warn(hdev->dev, "channel io frozen, need reset controller\n"); + + scsi_block_requests(hdev->shost); + + spraid_change_host_state(hdev, SPRAID_RESETTING); + + return PCI_ERS_RESULT_NEED_RESET; + case pci_channel_io_perm_failure: + dev_warn(hdev->dev, "channel io failure, request disconnect\n"); + + return PCI_ERS_RESULT_DISCONNECT; + } + + return PCI_ERS_RESULT_NEED_RESET; +} + +static pci_ers_result_t spraid_pci_slot_reset(struct pci_dev *pdev) +{ + struct spraid_dev *hdev = pci_get_drvdata(pdev); + + dev_info(hdev->dev, "restart after slot reset\n"); + + pci_restore_state(pdev); + + if (!queue_work(spraid_wq, &hdev->reset_work)) { + dev_err(hdev->dev, "[%s] err, the device is resetting state\n", __func__); + return PCI_ERS_RESULT_NONE; + } + + flush_work(&hdev->reset_work); + + scsi_unblock_requests(hdev->shost); + + return PCI_ERS_RESULT_RECOVERED; +} + +static void spraid_reset_done(struct pci_dev *pdev) +{ + struct spraid_dev *hdev = pci_get_drvdata(pdev); + + dev_info(hdev->dev, "enter spraid reset done\n"); +} + static ssize_t csts_pp_show(struct device *cdev, struct device_attribute *attr, char *buf) { struct Scsi_Host *shost = class_to_shost(cdev); @@ -3347,7 +3198,7 @@ static ssize_t fw_version_show(struct device *cdev, struct device_attribute *att struct Scsi_Host *shost = class_to_shost(cdev); struct spraid_dev *hdev = shost_priv(shost);
- return snprintf(buf, sizeof(hdev->ctrl_info->fr), "%s\n", hdev->ctrl_info->fr); + return snprintf(buf, PAGE_SIZE, "%s\n", hdev->ctrl_info->fr); }
static DEVICE_ATTR_RO(csts_pp); @@ -3365,6 +3216,185 @@ static struct device_attribute *spraid_host_attrs[] = { NULL, };
+static int spraid_get_vd_info(struct spraid_dev *hdev, struct spraid_vd_info *vd_info, u16 vid) +{ + struct spraid_admin_command admin_cmd; + u8 *data_ptr = NULL; + dma_addr_t data_dma = 0; + int ret; + + data_ptr = dma_alloc_coherent(hdev->dev, PAGE_SIZE, &data_dma, GFP_KERNEL); + if (!data_ptr) + return -ENOMEM; + + memset(&admin_cmd, 0, sizeof(admin_cmd)); + admin_cmd.usr_cmd.opcode = USR_CMD_READ; + admin_cmd.usr_cmd.info_0.subopcode = cpu_to_le16(USR_CMD_VDINFO); + admin_cmd.usr_cmd.info_1.data_len = cpu_to_le16(USR_CMD_RDLEN); + admin_cmd.usr_cmd.info_1.param_len = cpu_to_le16(VDINFO_PARAM_LEN); + admin_cmd.usr_cmd.cdw10 = cpu_to_le32(vid); + admin_cmd.common.dptr.prp1 = cpu_to_le64(data_dma); + + ret = spraid_submit_admin_sync_cmd(hdev, &admin_cmd, NULL, NULL, 0); + if (!ret) + memcpy(vd_info, data_ptr, sizeof(struct spraid_vd_info)); + + dma_free_coherent(hdev->dev, PAGE_SIZE, data_ptr, data_dma); + + return ret; +} + +static int spraid_get_bgtask(struct spraid_dev *hdev, struct spraid_bgtask *bgtask) +{ + struct spraid_admin_command admin_cmd; + u8 *data_ptr = NULL; + dma_addr_t data_dma = 0; + int ret; + + data_ptr = dma_alloc_coherent(hdev->dev, PAGE_SIZE, &data_dma, GFP_KERNEL); + if (!data_ptr) + return -ENOMEM; + + memset(&admin_cmd, 0, sizeof(admin_cmd)); + admin_cmd.usr_cmd.opcode = USR_CMD_READ; + admin_cmd.usr_cmd.info_0.subopcode = cpu_to_le16(USR_CMD_BGTASK); + admin_cmd.usr_cmd.info_1.data_len = cpu_to_le16(USR_CMD_RDLEN); + admin_cmd.common.dptr.prp1 = cpu_to_le64(data_dma); + + ret = spraid_submit_admin_sync_cmd(hdev, &admin_cmd, NULL, NULL, 0); + if (!ret) + memcpy(bgtask, data_ptr, sizeof(struct spraid_bgtask)); + + dma_free_coherent(hdev->dev, PAGE_SIZE, data_ptr, data_dma); + + return ret; +} + +static ssize_t raid_level_show(struct device *dev, struct device_attribute *attr, char *buf) +{ + struct scsi_device *sdev; + struct spraid_dev *hdev; + struct spraid_vd_info *vd_info; + struct spraid_sdev_hostdata *hostdata; + int ret; + + sdev = to_scsi_device(dev); + hdev = shost_priv(sdev->host); + hostdata = sdev->hostdata; + + vd_info = kmalloc(sizeof(*vd_info), GFP_KERNEL); + if (!vd_info || !SPRAID_DEV_INFO_ATTR_VD(hostdata->attr)) + return snprintf(buf, PAGE_SIZE, "NA\n"); + + ret = spraid_get_vd_info(hdev, vd_info, sdev->id); + if (ret) + vd_info->rg_level = ARRAY_SIZE(raid_levels) - 1; + + ret = (vd_info->rg_level < ARRAY_SIZE(raid_levels)) ? + vd_info->rg_level : (ARRAY_SIZE(raid_levels) - 1); + + kfree(vd_info); + + return snprintf(buf, PAGE_SIZE, "RAID-%s\n", raid_levels[ret]); +} + +static ssize_t raid_state_show(struct device *dev, struct device_attribute *attr, char *buf) +{ + struct scsi_device *sdev; + struct spraid_dev *hdev; + struct spraid_vd_info *vd_info; + struct spraid_sdev_hostdata *hostdata; + int ret; + + sdev = to_scsi_device(dev); + hdev = shost_priv(sdev->host); + hostdata = sdev->hostdata; + + vd_info = kmalloc(sizeof(*vd_info), GFP_KERNEL); + if (!vd_info || !SPRAID_DEV_INFO_ATTR_VD(hostdata->attr)) + return snprintf(buf, PAGE_SIZE, "NA\n"); + + ret = spraid_get_vd_info(hdev, vd_info, sdev->id); + if (ret) { + vd_info->vd_status = 0; + vd_info->rg_id = 0xff; + } + + ret = (vd_info->vd_status < ARRAY_SIZE(raid_states)) ? 
vd_info->vd_status : 0; + + kfree(vd_info); + + return snprintf(buf, PAGE_SIZE, "%s\n", raid_states[ret]); +} + +static ssize_t raid_resync_show(struct device *dev, struct device_attribute *attr, char *buf) +{ + struct scsi_device *sdev; + struct spraid_dev *hdev; + struct spraid_vd_info *vd_info; + struct spraid_bgtask *bgtask; + struct spraid_sdev_hostdata *hostdata; + u8 rg_id, i, progress = 0; + int ret; + + sdev = to_scsi_device(dev); + hdev = shost_priv(sdev->host); + hostdata = sdev->hostdata; + + vd_info = kmalloc(sizeof(*vd_info), GFP_KERNEL); + if (!vd_info || !SPRAID_DEV_INFO_ATTR_VD(hostdata->attr)) + return snprintf(buf, PAGE_SIZE, "NA\n"); + + ret = spraid_get_vd_info(hdev, vd_info, sdev->id); + if (ret) + goto out; + + rg_id = vd_info->rg_id; + + bgtask = (struct spraid_bgtask *)vd_info; + ret = spraid_get_bgtask(hdev, bgtask); + if (ret) + goto out; + for (i = 0; i < bgtask->task_num; i++) { + if ((bgtask->bgtask[i].type == BGTASK_TYPE_REBUILD) && + (le16_to_cpu(bgtask->bgtask[i].vd_id) == rg_id)) + progress = bgtask->bgtask[i].progress; + } + +out: + kfree(vd_info); + return snprintf(buf, PAGE_SIZE, "%d\n", progress); +} + +static DEVICE_ATTR_RO(raid_level); +static DEVICE_ATTR_RO(raid_state); +static DEVICE_ATTR_RO(raid_resync); + +static struct device_attribute *spraid_dev_attrs[] = { + &dev_attr_raid_level, + &dev_attr_raid_state, + &dev_attr_raid_resync, + NULL, +}; + +static struct pci_error_handlers spraid_err_handler = { + .error_detected = spraid_pci_error_detected, + .slot_reset = spraid_pci_slot_reset, + .reset_done = spraid_reset_done, +}; + +static int spraid_sysfs_host_reset(struct Scsi_Host *shost, int reset_type) +{ + int ret; + struct spraid_dev *hdev = shost_priv(shost); + + dev_info(hdev->dev, "[%s] start sysfs host reset cmd\n", __func__); + ret = spraid_reset_work_sync(hdev); + dev_info(hdev->dev, "[%s] stop sysfs host reset cmd[%d]\n", __func__, ret); + + return ret; +} + static struct scsi_host_template spraid_driver_template = { .module = THIS_MODULE, .name = "Ramaxel Logic spraid driver", @@ -3379,9 +3409,11 @@ static struct scsi_host_template spraid_driver_template = { .eh_bus_reset_handler = spraid_bus_reset_handler, .eh_host_reset_handler = spraid_shost_reset_handler, .change_queue_depth = scsi_change_queue_depth, - .host_tagset = 1, + .host_tagset = 0, .this_id = -1, .shost_attrs = spraid_host_attrs, + .sdev_attrs = spraid_dev_attrs, + .host_reset = spraid_sysfs_host_reset, };
static void spraid_shutdown(struct pci_dev *pdev) @@ -3392,11 +3424,53 @@ static void spraid_shutdown(struct pci_dev *pdev) spraid_disable_admin_queue(hdev, true); }
+/* bsg dispatch user command */ +static int spraid_bsg_host_dispatch(struct bsg_job *job) +{ + struct Scsi_Host *shost = dev_to_shost(job->dev); + struct spraid_dev *hdev = shost_priv(shost); + struct request *rq = blk_mq_rq_from_pdu(job); + struct spraid_bsg_request *bsg_req = job->request; + int ret = 0; + + dev_log_dbg(hdev->dev, "[%s] msgcode[%d], msglen[%d], timeout[%d], req_nsge[%d], req_len[%d]\n", + __func__, bsg_req->msgcode, job->request_len, rq->timeout, + job->request_payload.sg_cnt, job->request_payload.payload_len); + + job->reply_len = 0; + + switch (bsg_req->msgcode) { + case SPRAID_BSG_ADM: + ret = spraid_user_admin_cmd(hdev, job); + break; + case SPRAID_BSG_IOQ: + ret = spraid_user_ioq_cmd(hdev, job); + break; + default: + dev_info(hdev->dev, "[%s] unsupported msgcode[%d]\n", __func__, bsg_req->msgcode); + break; + } + + if (ret > 0) + ret = ret | (ret << 8); + + bsg_job_done(job, ret, 0); + return 0; +} + +static inline void spraid_remove_bsg(struct spraid_dev *hdev) +{ + if (hdev->bsg_queue) { + bsg_unregister_queue(hdev->bsg_queue); + blk_cleanup_queue(hdev->bsg_queue); + } +} static int spraid_probe(struct pci_dev *pdev, const struct pci_device_id *id) { struct spraid_dev *hdev; struct Scsi_Host *shost; int node, ret; + char bsg_name[15];
shost = scsi_host_alloc(&spraid_driver_template, sizeof(*hdev)); if (!shost) { @@ -3421,10 +3495,10 @@ static int spraid_probe(struct pci_dev *pdev, const struct pci_device_id *id) goto put_dev;
init_rwsem(&hdev->devices_rwsem); - INIT_WORK(&hdev->aen_work, spraid_async_event_work); INIT_WORK(&hdev->scan_work, spraid_scan_work); INIT_WORK(&hdev->timesyn_work, spraid_timesyn_work); INIT_WORK(&hdev->reset_work, spraid_reset_work); + INIT_WORK(&hdev->fw_act_work, spraid_fw_act_work); spin_lock_init(&hdev->state_lock);
ret = spraid_alloc_resources(hdev); @@ -3439,17 +3513,13 @@ static int spraid_probe(struct pci_dev *pdev, const struct pci_device_id *id) if (ret) goto pci_disable;
- ret = spraid_alloc_admin_tags(hdev); - if (ret) - goto disable_admin_q; - ret = spraid_init_ctrl_info(hdev); if (ret) - goto free_admin_tagset; + goto disable_admin_q;
ret = spraid_alloc_iod_ext_mem_pool(hdev); if (ret) - goto free_admin_tagset; + goto disable_admin_q;
ret = spraid_setup_io_queues(hdev); if (ret) @@ -3464,9 +3534,15 @@ static int spraid_probe(struct pci_dev *pdev, const struct pci_device_id *id) goto remove_io_queues; }
- ret = spraid_create_cdev(hdev); - if (ret) + snprintf(bsg_name, sizeof(bsg_name), "spraid%d", shost->host_no); + hdev->bsg_queue = bsg_setup_queue(&shost->shost_gendev, bsg_name, + spraid_bsg_host_dispatch, NULL, + spraid_cmd_size(hdev, true, false)); + if (IS_ERR(hdev->bsg_queue)) { + dev_err(hdev->dev, "err, setup bsg failed\n"); + hdev->bsg_queue = NULL; goto remove_io_queues; + }
if (hdev->online_queues == SPRAID_ADMIN_QUEUE_NUM) { dev_warn(hdev->dev, "warn only admin queue can be used\n"); @@ -3475,11 +3551,11 @@ static int spraid_probe(struct pci_dev *pdev, const struct pci_device_id *id)
hdev->state = SPRAID_LIVE;
- spraid_send_aen(hdev); + spraid_send_all_aen(hdev);
ret = spraid_dev_list_init(hdev); if (ret) - goto remove_cdev; + goto remove_bsg;
ret = spraid_configure_timestamp(hdev); if (ret) @@ -3487,20 +3563,18 @@ static int spraid_probe(struct pci_dev *pdev, const struct pci_device_id *id)
ret = spraid_alloc_ioq_ptcmds(hdev); if (ret) - goto remove_cdev; + goto remove_bsg;
scsi_scan_host(hdev->shost);
return 0;
-remove_cdev: - spraid_remove_cdev(hdev); +remove_bsg: + spraid_remove_bsg(hdev); remove_io_queues: spraid_remove_io_queues(hdev); free_iod_mempool: spraid_free_iod_ext_mem_pool(hdev); -free_admin_tagset: - spraid_remove_admin_tagset(hdev); disable_admin_q: spraid_disable_admin_queue(hdev, false); pci_disable: @@ -3524,22 +3598,17 @@ static void spraid_remove(struct pci_dev *pdev) dev_info(hdev->dev, "enter spraid remove\n");
spraid_change_host_state(hdev, SPRAID_DELETING); + flush_work(&hdev->reset_work);
- if (!pci_device_is_present(pdev)) { - scsi_block_requests(shost); + if (!pci_device_is_present(pdev)) spraid_back_all_io(hdev); - scsi_unblock_requests(shost); - }
- flush_work(&hdev->reset_work); + spraid_remove_bsg(hdev); scsi_remove_host(shost); - - kfree(hdev->ioq_ptcmds); + spraid_free_ioq_ptcmds(hdev); kfree(hdev->devices); - spraid_remove_cdev(hdev); spraid_remove_io_queues(hdev); spraid_free_iod_ext_mem_pool(hdev); - spraid_remove_admin_tagset(hdev); spraid_disable_admin_queue(hdev, false); spraid_pci_disable(hdev); spraid_free_resources(hdev); @@ -3551,7 +3620,7 @@ static void spraid_remove(struct pci_dev *pdev) }
static const struct pci_device_id spraid_id_table[] = { - { PCI_DEVICE(PCI_VENDOR_ID_RAMAXEL_LOGIC, SPRAID_SERVER_DEVICE_HAB_DID) }, + { PCI_DEVICE(PCI_VENDOR_ID_RAMAXEL_LOGIC, SPRAID_SERVER_DEVICE_HBA_DID) }, { PCI_DEVICE(PCI_VENDOR_ID_RAMAXEL_LOGIC, SPRAID_SERVER_DEVICE_RAID_DID) }, { 0, } }; @@ -3563,6 +3632,7 @@ static struct pci_driver spraid_driver = { .probe = spraid_probe, .remove = spraid_remove, .shutdown = spraid_shutdown, + .err_handler = &spraid_err_handler, };
static int __init spraid_init(void) @@ -3573,14 +3643,10 @@ static int __init spraid_init(void) if (!spraid_wq) return -ENOMEM;
- ret = alloc_chrdev_region(&spraid_chr_devt, 0, SPRAID_MINORS, "spraid"); - if (ret < 0) - goto destroy_wq; - spraid_class = class_create(THIS_MODULE, "spraid"); if (IS_ERR(spraid_class)) { ret = PTR_ERR(spraid_class); - goto unregister_chrdev; + goto destroy_wq; }
ret = pci_register_driver(&spraid_driver); @@ -3591,8 +3657,6 @@ static int __init spraid_init(void)
destroy_class: class_destroy(spraid_class); -unregister_chrdev: - unregister_chrdev_region(spraid_chr_devt, SPRAID_MINORS); destroy_wq: destroy_workqueue(spraid_wq);
@@ -3603,12 +3667,11 @@ static void __exit spraid_exit(void) { pci_unregister_driver(&spraid_driver); class_destroy(spraid_class); - unregister_chrdev_region(spraid_chr_devt, SPRAID_MINORS); destroy_workqueue(spraid_wq); ida_destroy(&spraid_instance_ida); }
-MODULE_AUTHOR("Ramaxel Memory Technology"); +MODULE_AUTHOR("songyl@ramaxel.com"); MODULE_DESCRIPTION("Ramaxel Memory Technology SPraid Driver"); MODULE_LICENSE("GPL"); MODULE_VERSION(SPRAID_DRV_VERSION);
From: "Darrick J. Wong" djwong@kernel.org
mainline-inclusion from mainline-v5.13-rc4 commit b8055ed6779d675e30f019ba3b7141848a4d6558 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4KIAO CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/fs...
---------------------------------------------------------------------- In commit 3b0fe4780580, we reduced the free space requirement to perform a pre-write unwritten extent conversion on an S_DAX file. Since we're not actually allocating any space, the logic goes, we only need enough reservation to handle shape changes in the bmbt.
The same logic should have been applied to quota -- we're not allocating any space, so we only need to reserve enough quota to handle the bmbt shape changes.
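To make the pairing concrete, a minimal before/after sketch of the reservation logic for the DAX conversion case (identifiers as in the hunk below): since no data blocks are allocated, the space and quota reservations must shrink together.

    /* before: space reservation shrunk, quota still full-size */
    resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0) << 1;

    /* after: space and quota reservations shrink together */
    resblks = qblocks = XFS_DIOSTRAT_SPACE_RES(mp, 0) << 1;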
Fixes: 3b0fe4780580 ("xfs: Don't use reserved blocks for data blocks with DAX") Signed-off-by: Darrick J. Wong djwong@kernel.org Reviewed-by: Christoph Hellwig hch@lst.de Reviewed-by: Brian Foster bfoster@redhat.com Signed-off-by: Lihong Kou koulihong@huawei.com Reviewed-by: Zhang Yi yi.zhang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- fs/xfs/xfs_iomap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 37bb6f20dfcf..6c397e9e8674 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -236,7 +236,7 @@ xfs_iomap_write_direct( bmapi_flags = XFS_BMAPI_CONVERT | XFS_BMAPI_ZERO; if (imap->br_state == XFS_EXT_UNWRITTEN) { tflags |= XFS_TRANS_RESERVE; - resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0) << 1; + resblks = qblocks = XFS_DIOSTRAT_SPACE_RES(mp, 0) << 1; } } error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, resrtextents,
From: "Darrick J. Wong" djwong@kernel.org
mainline-inclusion from mainline-v5.13-rc4 commit 4abe21ad67a7b9dc6844f55e91a6e3ef81879d42 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4KIAO CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/fs...
----------------------------------------------------------------------
Convert a few xfs_trans_*reserve* callsites that are open-coding other convenience functions.
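For illustration, the open-coded negation idiom and the convenience helper it maps to, simplified from the xfs_bmap.c hunk below (the rt/regular flag selection is elided):

    /* open-coded: unreserve by reserving a negated count */
    error = xfs_trans_reserve_quota_nblks(NULL, ip,
                    -((long)del->br_blockcount), 0,
                    XFS_QMOPT_RES_REGBLKS);

    /* equivalent convenience helper */
    error = xfs_trans_unreserve_quota_nblks(NULL, ip,
                    del->br_blockcount, 0,
                    XFS_QMOPT_RES_REGBLKS);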
Signed-off-by: Darrick J. Wong djwong@kernel.org Reviewed-by: Christoph Hellwig hch@lst.de Reviewed-by: Brian Foster bfoster@redhat.com Signed-off-by: Lihong Kou koulihong@huawei.com Reviewed-by: Zhang Yi yi.zhang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- fs/xfs/libxfs/xfs_bmap.c | 3 +-- fs/xfs/xfs_bmap_util.c | 4 ++-- fs/xfs/xfs_reflink.c | 4 ++-- 3 files changed, 5 insertions(+), 6 deletions(-)
diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 069b68d09287..ccad814c1baf 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -4825,8 +4825,7 @@ xfs_bmap_del_extent_delay( * sb counters as we might have to borrow some blocks for the * indirect block accounting. */ - error = xfs_trans_reserve_quota_nblks(NULL, ip, - -((long)del->br_blockcount), 0, + error = xfs_trans_unreserve_quota_nblks(NULL, ip, del->br_blockcount, 0, isrt ? XFS_QMOPT_RES_RTBLKS : XFS_QMOPT_RES_REGBLKS); if (error) return error; diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index 32180dd5ee7d..597fa5052798 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -879,8 +879,8 @@ xfs_unmap_extent( }
xfs_ilock(ip, XFS_ILOCK_EXCL); - error = xfs_trans_reserve_quota(tp, mp, ip->i_udquot, ip->i_gdquot, - ip->i_pdquot, resblks, 0, XFS_QMOPT_RES_REGBLKS); + error = xfs_trans_reserve_quota_nblks(tp, ip, resblks, 0, + XFS_QMOPT_RES_REGBLKS); if (error) goto out_trans_cancel;
diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index aa46b75d75af..ee9596fd56ba 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -508,8 +508,8 @@ xfs_reflink_cancel_cow_blocks( xfs_bmap_del_extent_cow(ip, &icur, &got, &del);
/* Remove the quota reservation */ - error = xfs_trans_reserve_quota_nblks(NULL, ip, - -(long)del.br_blockcount, 0, + error = xfs_trans_unreserve_quota_nblks(NULL, ip, + del.br_blockcount, 0, XFS_QMOPT_RES_REGBLKS); if (error) break;
From: "Darrick J. Wong" djwong@kernel.org
mainline-inclusion from mainline-v5.13-rc4 commit 8554650003b8a66f3dd357692ab73101d088d938 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4KIAO CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/fs...
----------------------------------------------------------------------
Create a couple of convenience wrappers for creating and deleting quota block reservations, in preparation for future changes.
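A minimal usage sketch of the new pair, modeled on the delalloc callsites converted below (error handling abbreviated):

    /* take an incore quota reservation for 'alen' blocks,
     * outside any transaction */
    error = xfs_quota_reserve_blkres(ip, alen);
    if (error)
            return error;

    /* ... and on the unwind path, hand the blocks back */
    xfs_quota_unreserve_blkres(ip, alen);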
Signed-off-by: Darrick J. Wong djwong@kernel.org Reviewed-by: Christoph Hellwig hch@lst.de Reviewed-by: Brian Foster bfoster@redhat.com Signed-off-by: Lihong Kou koulihong@huawei.com Reviewed-by: Zhang Yi yi.zhang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- fs/xfs/libxfs/xfs_bmap.c | 10 ++++------ fs/xfs/xfs_quota.h | 19 +++++++++++++++++++ fs/xfs/xfs_reflink.c | 5 ++--- 3 files changed, 25 insertions(+), 9 deletions(-)
diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index ccad814c1baf..79f976349a35 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -4000,8 +4000,7 @@ xfs_bmapi_reserve_delalloc( * blocks. This number gets adjusted later. We return if we haven't * allocated blocks already inside this loop. */ - error = xfs_trans_reserve_quota_nblks(NULL, ip, (long)alen, 0, - XFS_QMOPT_RES_REGBLKS); + error = xfs_quota_reserve_blkres(ip, alen); if (error) return error;
@@ -4047,8 +4046,7 @@ xfs_bmapi_reserve_delalloc( xfs_mod_fdblocks(mp, alen, false); out_unreserve_quota: if (XFS_IS_QUOTA_ON(mp)) - xfs_trans_unreserve_quota_nblks(NULL, ip, (long)alen, 0, - XFS_QMOPT_RES_REGBLKS); + xfs_quota_unreserve_blkres(ip, alen); return error; }
@@ -4825,8 +4823,8 @@ xfs_bmap_del_extent_delay( * sb counters as we might have to borrow some blocks for the * indirect block accounting. */ - error = xfs_trans_unreserve_quota_nblks(NULL, ip, del->br_blockcount, 0, - isrt ? XFS_QMOPT_RES_RTBLKS : XFS_QMOPT_RES_REGBLKS); + ASSERT(!isrt); + error = xfs_quota_unreserve_blkres(ip, del->br_blockcount); if (error) return error; ip->i_delayed_blks -= del->br_blockcount; diff --git a/fs/xfs/xfs_quota.h b/fs/xfs/xfs_quota.h index 5a62398940d0..1d1a1634ea29 100644 --- a/fs/xfs/xfs_quota.h +++ b/fs/xfs/xfs_quota.h @@ -108,6 +108,12 @@ extern void xfs_qm_mount_quotas(struct xfs_mount *); extern void xfs_qm_unmount(struct xfs_mount *); extern void xfs_qm_unmount_quotas(struct xfs_mount *);
+static inline int +xfs_quota_reserve_blkres(struct xfs_inode *ip, int64_t blocks) +{ + return xfs_trans_reserve_quota_nblks(NULL, ip, blocks, 0, + XFS_QMOPT_RES_REGBLKS); +} #else static inline int xfs_qm_vop_dqalloc(struct xfs_inode *ip, kuid_t kuid, kgid_t kgid, @@ -136,6 +142,13 @@ static inline int xfs_trans_reserve_quota_bydquots(struct xfs_trans *tp, { return 0; } + +static inline int +xfs_quota_reserve_blkres(struct xfs_inode *ip, int64_t blocks) +{ + return 0; +} + #define xfs_qm_vop_create_dqattach(tp, ip, u, g, p) #define xfs_qm_vop_rename_dqattach(it) (0) #define xfs_qm_vop_chown(tp, ip, old, new) (NULL) @@ -157,6 +170,12 @@ static inline int xfs_trans_reserve_quota_bydquots(struct xfs_trans *tp, xfs_trans_reserve_quota_bydquots(tp, mp, ud, gd, pd, nb, ni, \ f | XFS_QMOPT_RES_REGBLKS)
+static inline int +xfs_quota_unreserve_blkres(struct xfs_inode *ip, int64_t blocks) +{ + return xfs_quota_reserve_blkres(ip, -blocks); +} + extern int xfs_mount_reset_sbqflags(struct xfs_mount *);
#endif /* __XFS_QUOTA_H__ */ diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index ee9596fd56ba..256937a7c249 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -508,9 +508,8 @@ xfs_reflink_cancel_cow_blocks( xfs_bmap_del_extent_cow(ip, &icur, &got, &del);
/* Remove the quota reservation */ - error = xfs_trans_unreserve_quota_nblks(NULL, ip, - del.br_blockcount, 0, - XFS_QMOPT_RES_REGBLKS); + error = xfs_quota_unreserve_blkres(ip, + del.br_blockcount); if (error) break; } else {
From: "Darrick J. Wong" djwong@kernel.org
mainline-inclusion from mainline-v5.11-rc4 commit 35b1101099e85af74a46b8e36f4d1fdac0367ffd category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4KIAO CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
-------------------------------------------------
xfs_trans_cancel will release all the quota resources that were reserved on behalf of the transaction, so get rid of the explicit unreserve step.
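The error paths therefore collapse to a plain cancel; a sketch using the label shapes from the hunks below:

    out_trans_cancel:
            /* xfs_trans_cancel() also backs out any quota that was
             * reserved against this transaction */
            xfs_trans_cancel(tp);
            xfs_iunlock(ip, XFS_ILOCK_EXCL);
            return error;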
Signed-off-by: Darrick J. Wong djwong@kernel.org Reviewed-by: Christoph Hellwig hch@lst.de Reviewed-by: Brian Foster bfoster@redhat.com Signed-off-by: Lihong Kou koulihong@huawei.com Reviewed-by: Zhang Yi yi.zhang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- fs/xfs/xfs_bmap_util.c | 9 +++------ fs/xfs/xfs_iomap.c | 4 +--- fs/xfs/xfs_quota.h | 2 -- fs/xfs/xfs_reflink.c | 5 +---- 4 files changed, 5 insertions(+), 15 deletions(-)
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index 597fa5052798..6b547be6f5c4 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -820,7 +820,7 @@ xfs_alloc_file_space( error = xfs_trans_reserve_quota_nblks(tp, ip, qblocks, 0, quota_flag); if (error) - goto error1; + goto error;
xfs_trans_ijoin(tp, ip, 0);
@@ -828,7 +828,7 @@ xfs_alloc_file_space( allocatesize_fsb, alloc_type, 0, imapp, &nimaps); if (error) - goto error0; + goto error;
/* * Complete the transaction @@ -851,10 +851,7 @@ xfs_alloc_file_space(
return error;
-error0: /* unlock inode, unreserve quota blocks, cancel trans */ - xfs_trans_unreserve_quota_nblks(tp, ip, (long)qblocks, 0, quota_flag); - -error1: /* Just cancel transaction */ +error: xfs_trans_cancel(tp); xfs_iunlock(ip, XFS_ILOCK_EXCL); return error; diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 6c397e9e8674..268fee862632 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -260,7 +260,7 @@ xfs_iomap_write_direct( error = xfs_bmapi_write(tp, ip, offset_fsb, count_fsb, bmapi_flags, 0, imap, &nimaps); if (error) - goto out_res_cancel; + goto out_trans_cancel;
/* * Complete the transaction @@ -284,8 +284,6 @@ xfs_iomap_write_direct( xfs_iunlock(ip, XFS_ILOCK_EXCL); return error;
-out_res_cancel: - xfs_trans_unreserve_quota_nblks(tp, ip, (long)qblocks, 0, quota_flag); out_trans_cancel: xfs_trans_cancel(tp); goto out_unlock; diff --git a/fs/xfs/xfs_quota.h b/fs/xfs/xfs_quota.h index 1d1a1634ea29..31d0de899cc4 100644 --- a/fs/xfs/xfs_quota.h +++ b/fs/xfs/xfs_quota.h @@ -164,8 +164,6 @@ xfs_quota_reserve_blkres(struct xfs_inode *ip, int64_t blocks) #define xfs_qm_unmount_quotas(mp) #endif /* CONFIG_XFS_QUOTA */
-#define xfs_trans_unreserve_quota_nblks(tp, ip, nblks, ninos, flags) \ - xfs_trans_reserve_quota_nblks(tp, ip, -(nblks), -(ninos), flags) #define xfs_trans_reserve_quota(tp, mp, ud, gd, pd, nb, ni, f) \ xfs_trans_reserve_quota_bydquots(tp, mp, ud, gd, pd, nb, ni, \ f | XFS_QMOPT_RES_REGBLKS) diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index 256937a7c249..cbdde09031b6 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -411,7 +411,7 @@ xfs_reflink_allocate_cow( XFS_BMAPI_COWFORK | XFS_BMAPI_PREALLOC, 0, cmap, &nimaps); if (error) - goto out_unreserve; + goto out_trans_cancel;
xfs_inode_set_cowblocks_tag(ip); error = xfs_trans_commit(tp); @@ -436,9 +436,6 @@ xfs_reflink_allocate_cow( trace_xfs_reflink_convert_cow(ip, cmap); return xfs_reflink_convert_cow_locked(ip, offset_fsb, count_fsb);
-out_unreserve: - xfs_trans_unreserve_quota_nblks(tp, ip, (long)resblks, 0, - XFS_QMOPT_RES_REGBLKS); out_trans_cancel: xfs_trans_cancel(tp); return error;
From: "Darrick J. Wong" djwong@kernel.org
mainline-inclusion from mainline-v5.11-rc4 commit ad4a74739708e193c21245dae908ff50f72ff207 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4KIAO CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
-------------------------------------------------
Create a proper helper so that inode creation calls can reserve quota with a dedicated function.
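A sketch of a creation-path caller (resblks and the dquot pointers come from the surrounding inode-creation code):

    /* reserve 'resblks' data blocks plus one inode against the
     * user/group/project dquots chosen for the new inode */
    error = xfs_trans_reserve_quota_icreate(tp, udqp, gdqp, pdqp, resblks);
    if (error)
            goto out_trans_cancel;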
Signed-off-by: Darrick J. Wong djwong@kernel.org Reviewed-by: Christoph Hellwig hch@lst.de Reviewed-by: Brian Foster bfoster@redhat.com Reviewed-by: Chaitanya Kulkarni chaitanya.kulkarni@wdc.com Signed-off-by: Lihong Kou koulihong@huawei.com Reviewed-by: Zhang Yi yi.zhang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- fs/xfs/xfs_inode.c | 6 ++---- fs/xfs/xfs_quota.h | 14 ++++++++++---- fs/xfs/xfs_symlink.c | 3 +-- fs/xfs/xfs_trans_dquot.c | 18 ++++++++++++++++++ 4 files changed, 31 insertions(+), 10 deletions(-)
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 36c5d212bdde..fe41492b80d5 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -1172,8 +1172,7 @@ xfs_create( /* * Reserve disk quota and the inode. */ - error = xfs_trans_reserve_quota(tp, mp, udqp, gdqp, - pdqp, resblks, 1, 0); + error = xfs_trans_reserve_quota_icreate(tp, udqp, gdqp, pdqp, resblks); if (error) goto out_trans_cancel;
@@ -1299,8 +1298,7 @@ xfs_create_tmpfile( if (error) goto out_release_inode;
- error = xfs_trans_reserve_quota(tp, mp, udqp, gdqp, - pdqp, resblks, 1, 0); + error = xfs_trans_reserve_quota_icreate(tp, udqp, gdqp, pdqp, resblks); if (error) goto out_trans_cancel;
diff --git a/fs/xfs/xfs_quota.h b/fs/xfs/xfs_quota.h index 31d0de899cc4..919c3a924821 100644 --- a/fs/xfs/xfs_quota.h +++ b/fs/xfs/xfs_quota.h @@ -86,6 +86,9 @@ extern int xfs_trans_reserve_quota_nblks(struct xfs_trans *, extern int xfs_trans_reserve_quota_bydquots(struct xfs_trans *, struct xfs_mount *, struct xfs_dquot *, struct xfs_dquot *, struct xfs_dquot *, int64_t, long, uint); +int xfs_trans_reserve_quota_icreate(struct xfs_trans *tp, + struct xfs_dquot *udqp, struct xfs_dquot *gdqp, + struct xfs_dquot *pdqp, int64_t dblocks);
extern int xfs_qm_vop_dqalloc(struct xfs_inode *, kuid_t, kgid_t, prid_t, uint, struct xfs_dquot **, struct xfs_dquot **, @@ -149,6 +152,13 @@ xfs_quota_reserve_blkres(struct xfs_inode *ip, int64_t blocks) return 0; }
+static inline int +xfs_trans_reserve_quota_icreate(struct xfs_trans *tp, struct xfs_dquot *udqp, + struct xfs_dquot *gdqp, struct xfs_dquot *pdqp, int64_t dblocks) +{ + return 0; +} + #define xfs_qm_vop_create_dqattach(tp, ip, u, g, p) #define xfs_qm_vop_rename_dqattach(it) (0) #define xfs_qm_vop_chown(tp, ip, old, new) (NULL) @@ -164,10 +174,6 @@ xfs_quota_reserve_blkres(struct xfs_inode *ip, int64_t blocks) #define xfs_qm_unmount_quotas(mp) #endif /* CONFIG_XFS_QUOTA */
-#define xfs_trans_reserve_quota(tp, mp, ud, gd, pd, nb, ni, f) \ - xfs_trans_reserve_quota_bydquots(tp, mp, ud, gd, pd, nb, ni, \ - f | XFS_QMOPT_RES_REGBLKS) - static inline int xfs_quota_unreserve_blkres(struct xfs_inode *ip, int64_t blocks) { diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c index 8e88a7ca387e..3ccf77a126b9 100644 --- a/fs/xfs/xfs_symlink.c +++ b/fs/xfs/xfs_symlink.c @@ -215,8 +215,7 @@ xfs_symlink( /* * Reserve disk quota : blocks and inode. */ - error = xfs_trans_reserve_quota(tp, mp, udqp, gdqp, - pdqp, resblks, 1, 0); + error = xfs_trans_reserve_quota_icreate(tp, udqp, gdqp, pdqp, resblks); if (error) goto out_trans_cancel;
diff --git a/fs/xfs/xfs_trans_dquot.c b/fs/xfs/xfs_trans_dquot.c index fe45b0c3970c..f18a2b80486e 100644 --- a/fs/xfs/xfs_trans_dquot.c +++ b/fs/xfs/xfs_trans_dquot.c @@ -825,6 +825,24 @@ xfs_trans_reserve_quota_nblks( nblks, ninos, flags); }
+/* Change the quota reservations for an inode creation activity. */ +int +xfs_trans_reserve_quota_icreate( + struct xfs_trans *tp, + struct xfs_dquot *udqp, + struct xfs_dquot *gdqp, + struct xfs_dquot *pdqp, + int64_t dblocks) +{ + struct xfs_mount *mp = tp->t_mountp; + + if (!XFS_IS_QUOTA_RUNNING(mp) || !XFS_IS_QUOTA_ON(mp)) + return 0; + + return xfs_trans_reserve_quota_bydquots(tp, mp, udqp, gdqp, pdqp, + dblocks, 1, XFS_QMOPT_RES_REGBLKS); +} + /* * This routine is called to allocate a quotaoff log item. */
From: "Darrick J. Wong" djwong@kernel.org
mainline-inclusion from mainline-v5.11-rc4 commit 7ac6eb46c9f32d3e6ae37943191cd744ffa1ef33 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4KIAO CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
-------------------------------------------------
Fix some build warnings on gcc 10.2 when quotas are disabled.
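For context, the class of warning being silenced: with quotas disabled the function-like macros expand to nothing, leaving a bare ';' that gcc 10.2 can flag (e.g. -Wempty-body). A hypothetical illustration, not taken from the patch:

    if (error)
            xfs_qm_statvfs(ip, s);  /* expands to just ";" */

The do { } while (0) form keeps a real statement in place, and the (d) = (d) self-assignment keeps the argument "used" so set-but-unused warnings stay quiet.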
Signed-off-by: Darrick J. Wong djwong@kernel.org Reviewed-by: Christoph Hellwig hch@lst.de Reviewed-by: Brian Foster bfoster@redhat.com Signed-off-by: Lihong Kou koulihong@huawei.com Reviewed-by: Zhang Yi yi.zhang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- fs/xfs/xfs_quota.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/fs/xfs/xfs_quota.h b/fs/xfs/xfs_quota.h index 919c3a924821..03235c184aab 100644 --- a/fs/xfs/xfs_quota.h +++ b/fs/xfs/xfs_quota.h @@ -130,7 +130,7 @@ xfs_qm_vop_dqalloc(struct xfs_inode *ip, kuid_t kuid, kgid_t kgid, } #define xfs_trans_dup_dqinfo(tp, tp2) #define xfs_trans_free_dqinfo(tp) -#define xfs_trans_mod_dquot_byino(tp, ip, fields, delta) +#define xfs_trans_mod_dquot_byino(tp, ip, fields, delta) do { } while (0) #define xfs_trans_apply_dquot_deltas(tp) #define xfs_trans_unreserve_and_mod_dquots(tp) static inline int xfs_trans_reserve_quota_nblks(struct xfs_trans *tp, @@ -166,8 +166,8 @@ xfs_trans_reserve_quota_icreate(struct xfs_trans *tp, struct xfs_dquot *udqp, #define xfs_qm_dqattach(ip) (0) #define xfs_qm_dqattach_locked(ip, fl) (0) #define xfs_qm_dqdetach(ip) -#define xfs_qm_dqrele(d) -#define xfs_qm_statvfs(ip, s) +#define xfs_qm_dqrele(d) do { (d) = (d); } while(0) +#define xfs_qm_statvfs(ip, s) do { } while(0) #define xfs_qm_newmount(mp, a, b) (0) #define xfs_qm_mount_quotas(mp) #define xfs_qm_unmount(mp)
From: "Darrick J. Wong" djwong@kernel.org
mainline-inclusion from mainline-v5.11-rc4 commit 02b7ee4eb613240d2bb3f6a67723f94ceda19eb6 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4KIAO CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
-------------------------------------------------
Modify xfs_trans_reserve_quota_nblks so that we can reserve data and realtime blocks from the dquot at the same time. This change has the theoretical side effect that for allocations to realtime files we will reserve from the dquot both the number of rtblocks being allocated and the number of bmbt blocks that might be needed to add the mapping. However, since the mount code disables quota if it finds a realtime device, this should not result in any behavior changes.
Now that we've moved the inode creation callers away from using the _nblks function, we can repurpose the (now unused) ninos argument for realtime blocks, so make that change. This also replaces the flags argument with a boolean parameter to force the reservation since we don't need to distinguish between data and rt quota reservations any more, and the only flag being passed in was FORCE_RES.
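The net change to the prototype (old vs. new, per the xfs_quota.h hunk below):

    /* old: block count, inode count, QMOPT flags */
    int xfs_trans_reserve_quota_nblks(struct xfs_trans *tp,
                    struct xfs_inode *ip, int64_t nblks, long ninos,
                    uint flags);

    /* new: data blocks, realtime blocks, force boolean */
    int xfs_trans_reserve_quota_nblks(struct xfs_trans *tp,
                    struct xfs_inode *ip, int64_t dblocks,
                    int64_t rblocks, bool force);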
Signed-off-by: Darrick J. Wong djwong@kernel.org Reviewed-by: Christoph Hellwig hch@lst.de Reviewed-by: Brian Foster bfoster@redhat.com Signed-off-by: Lihong Kou koulihong@huawei.com Reviewed-by: Zhang Yi yi.zhang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- fs/xfs/libxfs/xfs_attr.c | 6 +----- fs/xfs/libxfs/xfs_bmap.c | 4 +--- fs/xfs/xfs_bmap_util.c | 24 +++++++++++------------- fs/xfs/xfs_iomap.c | 26 +++++++++++++------------- fs/xfs/xfs_quota.h | 10 +++++----- fs/xfs/xfs_reflink.c | 6 ++---- fs/xfs/xfs_trans_dquot.c | 40 +++++++++++++++++++++++++++------------- 7 files changed, 60 insertions(+), 56 deletions(-)
diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c index 96ac7e562b87..3c36e998b0c4 100644 --- a/fs/xfs/libxfs/xfs_attr.c +++ b/fs/xfs/libxfs/xfs_attr.c @@ -461,12 +461,8 @@ xfs_attr_set( xfs_ilock(dp, XFS_ILOCK_EXCL); xfs_trans_ijoin(args->trans, dp, 0); if (args->value) { - unsigned int quota_flags = XFS_QMOPT_RES_REGBLKS; - - if (rsvd) - quota_flags |= XFS_QMOPT_FORCE_RES; error = xfs_trans_reserve_quota_nblks(args->trans, dp, - args->total, 0, quota_flags); + args->total, 0, rsvd); if (error) goto out_trans_cancel;
diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 79f976349a35..9d51787933c6 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -1084,9 +1084,7 @@ xfs_bmap_add_attrfork( return error;
xfs_ilock(ip, XFS_ILOCK_EXCL); - error = xfs_trans_reserve_quota_nblks(tp, ip, blks, 0, rsvd ? - XFS_QMOPT_RES_REGBLKS | XFS_QMOPT_FORCE_RES : - XFS_QMOPT_RES_REGBLKS); + error = xfs_trans_reserve_quota_nblks(tp, ip, blks, 0, rsvd); if (error) goto trans_cancel; if (XFS_IFORK_Q(ip)) diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index 6b547be6f5c4..dceca50d7536 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -727,11 +727,10 @@ xfs_alloc_file_space( xfs_fileoff_t startoffset_fsb; xfs_fileoff_t endoffset_fsb; int nimaps; - int quota_flag; int rt; xfs_trans_t *tp; xfs_bmbt_irec_t imaps[1], *imapp; - uint qblocks, resblks, resrtextents; + uint resblks, resrtextents; int error;
trace_xfs_alloc_file_space(ip); @@ -761,6 +760,7 @@ xfs_alloc_file_space( */ while (allocatesize_fsb && !error) { xfs_fileoff_t s, e; + unsigned int dblocks, rblocks;
/* * Determine space reservations for data/realtime. @@ -790,20 +790,19 @@ xfs_alloc_file_space( */ resblks = min_t(xfs_fileoff_t, (e - s), (MAXEXTLEN * nimaps)); if (unlikely(rt)) { - resrtextents = qblocks = resblks; + resrtextents = resblks; resrtextents /= mp->m_sb.sb_rextsize; - resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0); - quota_flag = XFS_QMOPT_RES_RTBLKS; + dblocks = XFS_DIOSTRAT_SPACE_RES(mp, 0); + rblocks = resblks; } else { - resrtextents = 0; - resblks = qblocks = XFS_DIOSTRAT_SPACE_RES(mp, resblks); - quota_flag = XFS_QMOPT_RES_REGBLKS; + dblocks = XFS_DIOSTRAT_SPACE_RES(mp, resblks); + rblocks = 0; }
/* * Allocate and setup the transaction. */ - error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, dblocks, resrtextents, 0, &tp);
/* @@ -817,8 +816,8 @@ xfs_alloc_file_space( break; } xfs_ilock(ip, XFS_ILOCK_EXCL); - error = xfs_trans_reserve_quota_nblks(tp, ip, qblocks, - 0, quota_flag); + error = xfs_trans_reserve_quota_nblks(tp, ip, dblocks, rblocks, + false); if (error) goto error;
@@ -876,8 +875,7 @@ xfs_unmap_extent( }
xfs_ilock(ip, XFS_ILOCK_EXCL); - error = xfs_trans_reserve_quota_nblks(tp, ip, resblks, 0, - XFS_QMOPT_RES_REGBLKS); + error = xfs_trans_reserve_quota_nblks(tp, ip, resblks, 0, false); if (error) goto out_trans_cancel;
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 268fee862632..700ddc45545c 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -194,25 +194,25 @@ xfs_iomap_write_direct( struct xfs_trans *tp; xfs_filblks_t resaligned; int nimaps; - int quota_flag; - uint qblocks, resblks; + unsigned int dblocks, rblocks; unsigned int resrtextents = 0; int error; int bmapi_flags = XFS_BMAPI_PREALLOC; - uint tflags = 0; + int tflags = 0; + bool force = false;
ASSERT(count_fsb > 0);
resaligned = xfs_aligned_fsb_count(offset_fsb, count_fsb, xfs_get_extsz_hint(ip)); if (unlikely(XFS_IS_REALTIME_INODE(ip))) { - resrtextents = qblocks = resaligned; + resrtextents = resaligned; resrtextents /= mp->m_sb.sb_rextsize; - resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0); - quota_flag = XFS_QMOPT_RES_RTBLKS; + dblocks = XFS_DIOSTRAT_SPACE_RES(mp, 0); + rblocks = resaligned; } else { - resblks = qblocks = XFS_DIOSTRAT_SPACE_RES(mp, resaligned); - quota_flag = XFS_QMOPT_RES_REGBLKS; + dblocks = XFS_DIOSTRAT_SPACE_RES(mp, resaligned); + rblocks = 0; }
error = xfs_qm_dqattach(ip); @@ -235,18 +235,19 @@ xfs_iomap_write_direct( if (IS_DAX(VFS_I(ip))) { bmapi_flags = XFS_BMAPI_CONVERT | XFS_BMAPI_ZERO; if (imap->br_state == XFS_EXT_UNWRITTEN) { + force = true; tflags |= XFS_TRANS_RESERVE; - resblks = qblocks = XFS_DIOSTRAT_SPACE_RES(mp, 0) << 1; + dblocks = XFS_DIOSTRAT_SPACE_RES(mp, 0) << 1; } } - error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, resrtextents, + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, dblocks, resrtextents, tflags, &tp); if (error) return error;
xfs_ilock(ip, XFS_ILOCK_EXCL);
- error = xfs_trans_reserve_quota_nblks(tp, ip, qblocks, 0, quota_flag); + error = xfs_trans_reserve_quota_nblks(tp, ip, dblocks, rblocks, force); if (error) goto out_trans_cancel;
@@ -554,8 +555,7 @@ xfs_iomap_write_unwritten( xfs_ilock(ip, XFS_ILOCK_EXCL); xfs_trans_ijoin(tp, ip, 0);
- error = xfs_trans_reserve_quota_nblks(tp, ip, resblks, 0, - XFS_QMOPT_RES_REGBLKS | XFS_QMOPT_FORCE_RES); + error = xfs_trans_reserve_quota_nblks(tp, ip, resblks, 0, true); if (error) goto error_on_bmapi_transaction;
diff --git a/fs/xfs/xfs_quota.h b/fs/xfs/xfs_quota.h index 03235c184aab..6ddc4b358ede 100644 --- a/fs/xfs/xfs_quota.h +++ b/fs/xfs/xfs_quota.h @@ -81,8 +81,8 @@ extern void xfs_trans_mod_dquot_byino(struct xfs_trans *, struct xfs_inode *, uint, int64_t); extern void xfs_trans_apply_dquot_deltas(struct xfs_trans *); extern void xfs_trans_unreserve_and_mod_dquots(struct xfs_trans *); -extern int xfs_trans_reserve_quota_nblks(struct xfs_trans *, - struct xfs_inode *, int64_t, long, uint); +int xfs_trans_reserve_quota_nblks(struct xfs_trans *tp, struct xfs_inode *ip, + int64_t dblocks, int64_t rblocks, bool force); extern int xfs_trans_reserve_quota_bydquots(struct xfs_trans *, struct xfs_mount *, struct xfs_dquot *, struct xfs_dquot *, struct xfs_dquot *, int64_t, long, uint); @@ -114,8 +114,7 @@ extern void xfs_qm_unmount_quotas(struct xfs_mount *); static inline int xfs_quota_reserve_blkres(struct xfs_inode *ip, int64_t blocks) { - return xfs_trans_reserve_quota_nblks(NULL, ip, blocks, 0, - XFS_QMOPT_RES_REGBLKS); + return xfs_trans_reserve_quota_nblks(NULL, ip, blocks, 0, false); } #else static inline int @@ -134,7 +133,8 @@ xfs_qm_vop_dqalloc(struct xfs_inode *ip, kuid_t kuid, kgid_t kgid, #define xfs_trans_apply_dquot_deltas(tp) #define xfs_trans_unreserve_and_mod_dquots(tp) static inline int xfs_trans_reserve_quota_nblks(struct xfs_trans *tp, - struct xfs_inode *ip, int64_t nblks, long ninos, uint flags) + struct xfs_inode *ip, int64_t dblocks, int64_t rblocks, + bool force) { return 0; } diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index cbdde09031b6..bcf0d349d3fc 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -398,8 +398,7 @@ xfs_reflink_allocate_cow( goto convert; }
- error = xfs_trans_reserve_quota_nblks(tp, ip, resblks, 0, - XFS_QMOPT_RES_REGBLKS); + error = xfs_trans_reserve_quota_nblks(tp, ip, resblks, 0, false); if (error) goto out_trans_cancel;
@@ -1084,8 +1083,7 @@ xfs_reflink_remap_extent( if (!smap_real && dmap_written) qres += dmap->br_blockcount; if (qres > 0) { - error = xfs_trans_reserve_quota_nblks(tp, ip, qres, 0, - XFS_QMOPT_RES_REGBLKS); + error = xfs_trans_reserve_quota_nblks(tp, ip, qres, 0, false); if (error) goto out_cancel; } diff --git a/fs/xfs/xfs_trans_dquot.c b/fs/xfs/xfs_trans_dquot.c index f18a2b80486e..520b24994813 100644 --- a/fs/xfs/xfs_trans_dquot.c +++ b/fs/xfs/xfs_trans_dquot.c @@ -801,28 +801,42 @@ int xfs_trans_reserve_quota_nblks( struct xfs_trans *tp, struct xfs_inode *ip, - int64_t nblks, - long ninos, - uint flags) + int64_t dblocks, + int64_t rblocks, + bool force) { struct xfs_mount *mp = ip->i_mount; + unsigned int qflags = 0; + int error;
if (!XFS_IS_QUOTA_RUNNING(mp) || !XFS_IS_QUOTA_ON(mp)) return 0;
ASSERT(!xfs_is_quota_inode(&mp->m_sb, ip->i_ino)); - ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); - ASSERT((flags & ~(XFS_QMOPT_FORCE_RES)) == XFS_TRANS_DQ_RES_RTBLKS || - (flags & ~(XFS_QMOPT_FORCE_RES)) == XFS_TRANS_DQ_RES_BLKS);
- /* - * Reserve nblks against these dquots, with trans as the mediator. - */ - return xfs_trans_reserve_quota_bydquots(tp, mp, - ip->i_udquot, ip->i_gdquot, - ip->i_pdquot, - nblks, ninos, flags); + if (force) + qflags |= XFS_QMOPT_FORCE_RES; + + /* Reserve data device quota against the inode's dquots. */ + error = xfs_trans_reserve_quota_bydquots(tp, mp, ip->i_udquot, + ip->i_gdquot, ip->i_pdquot, dblocks, 0, + XFS_QMOPT_RES_REGBLKS | qflags); + if (error) + return error; + + /* Do the same but for realtime blocks. */ + error = xfs_trans_reserve_quota_bydquots(tp, mp, ip->i_udquot, + ip->i_gdquot, ip->i_pdquot, rblocks, 0, + XFS_QMOPT_RES_RTBLKS | qflags); + if (error) { + xfs_trans_reserve_quota_bydquots(tp, mp, ip->i_udquot, + ip->i_gdquot, ip->i_pdquot, -dblocks, 0, + XFS_QMOPT_RES_REGBLKS); + return error; + } + + return 0; }
/* Change the quota reservations for an inode creation activity. */
From: "Darrick J. Wong" djwong@kernel.org
mainline-inclusion from mainline-v5.11-rc4 commit 3a1af6c317d0a55524f39079183be107be4c1f39 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4KIAO CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
-------------------------------------------------
Create a new helper xfs_trans_alloc_inode that allocates a transaction, locks and joins an inode to it, and then reserves the appropriate amount of quota against that transaction. Then replace all the open-coded idioms with a single call to this helper.
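The resulting callsite shape, sketched from the xfs_unmap_extent conversion below (error handling abbreviated):

    error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_write, resblks,
                    false, &tp);
    if (error)
            return error;
    /* on success: tp allocated, ILOCK_EXCL held, inode joined,
     * and 'resblks' of data quota reserved */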
Signed-off-by: Darrick J. Wong djwong@kernel.org Reviewed-by: Christoph Hellwig hch@lst.de Reviewed-by: Brian Foster bfoster@redhat.com Signed-off-by: Lihong Kou koulihong@huawei.com Reviewed-by: Zhang Yi yi.zhang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- fs/xfs/libxfs/xfs_attr.c | 10 +-------- fs/xfs/libxfs/xfs_bmap.c | 10 ++------- fs/xfs/xfs_bmap_util.c | 14 +++--------- fs/xfs/xfs_iomap.c | 10 ++------- fs/xfs/xfs_trans.c | 48 ++++++++++++++++++++++++++++++++++++++++ fs/xfs/xfs_trans.h | 3 +++ 6 files changed, 59 insertions(+), 36 deletions(-)
diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c index 3c36e998b0c4..1e29fc115034 100644 --- a/fs/xfs/libxfs/xfs_attr.c +++ b/fs/xfs/libxfs/xfs_attr.c @@ -453,19 +453,11 @@ xfs_attr_set( * Root fork attributes can use reserved data blocks for this * operation if necessary */ - error = xfs_trans_alloc(mp, &tres, total, 0, - rsvd ? XFS_TRANS_RESERVE : 0, &args->trans); + error = xfs_trans_alloc_inode(dp, &tres, total, rsvd, &args->trans); if (error) return error;
- xfs_ilock(dp, XFS_ILOCK_EXCL); - xfs_trans_ijoin(args->trans, dp, 0); if (args->value) { - error = xfs_trans_reserve_quota_nblks(args->trans, dp, - args->total, 0, rsvd); - if (error) - goto out_trans_cancel; - error = xfs_has_attr(args); if (error == -EEXIST && (args->attr_flags & XATTR_CREATE)) goto out_trans_cancel; diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 9d51787933c6..84bb3813afd7 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -1078,19 +1078,13 @@ xfs_bmap_add_attrfork(
blks = XFS_ADDAFORK_SPACE_RES(mp);
- error = xfs_trans_alloc(mp, &M_RES(mp)->tr_addafork, blks, 0, - rsvd ? XFS_TRANS_RESERVE : 0, &tp); + error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_addafork, blks, + rsvd, &tp); if (error) return error; - - xfs_ilock(ip, XFS_ILOCK_EXCL); - error = xfs_trans_reserve_quota_nblks(tp, ip, blks, 0, rsvd); - if (error) - goto trans_cancel; if (XFS_IFORK_Q(ip)) goto trans_cancel;
- xfs_trans_ijoin(tp, ip, 0); xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); error = xfs_bmap_set_attrforkoff(ip, size, &version); if (error) diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index dceca50d7536..323791539b2f 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -868,18 +868,10 @@ xfs_unmap_extent( uint resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0); int error;
- error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, 0, &tp); - if (error) { - ASSERT(error == -ENOSPC || XFS_FORCED_SHUTDOWN(mp)); - return error; - } - - xfs_ilock(ip, XFS_ILOCK_EXCL); - error = xfs_trans_reserve_quota_nblks(tp, ip, resblks, 0, false); + error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_write, resblks, + false, &tp); if (error) - goto out_trans_cancel; - - xfs_trans_ijoin(tp, ip, 0); + return error;
error = xfs_bunmapi(tp, ip, startoffset_fsb, len_fsb, 0, 2, done); if (error) diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 700ddc45545c..5a87c5372db0 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -547,17 +547,11 @@ xfs_iomap_write_unwritten( * here as we might be asked to write out the same inode that we * complete here and might deadlock on the iolock. */ - error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, - XFS_TRANS_RESERVE, &tp); + error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_write, resblks, + true, &tp); if (error) return error;
- xfs_ilock(ip, XFS_ILOCK_EXCL); - xfs_trans_ijoin(tp, ip, 0); - - error = xfs_trans_reserve_quota_nblks(tp, ip, resblks, 0, true); - if (error) - goto error_on_bmapi_transaction;
/* * Modify the unwritten extent state of the buffer. diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index c19807abbefa..a36b2c133f93 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -20,6 +20,7 @@ #include "xfs_trace.h" #include "xfs_error.h" #include "xfs_defer.h" +#include "xfs_inode.h"
kmem_zone_t *xfs_trans_zone;
@@ -1020,3 +1021,50 @@ xfs_trans_roll( tres.tr_logflags = XFS_TRANS_PERM_LOG_RES; return xfs_trans_reserve(*tpp, &tres, 0, 0); } + +/* + * Allocate a transaction, lock and join the inode to it, and reserve quota. + * + * The caller must ensure that the on-disk dquots attached to this inode have + * already been allocated and initialized. The caller is responsible for + * releasing ILOCK_EXCL if a new transaction is returned. + */ +int +xfs_trans_alloc_inode( + struct xfs_inode *ip, + struct xfs_trans_res *resv, + unsigned int dblocks, + bool force, + struct xfs_trans **tpp) +{ + struct xfs_trans *tp; + struct xfs_mount *mp = ip->i_mount; + int error; + + error = xfs_trans_alloc(mp, resv, dblocks, 0, + force ? XFS_TRANS_RESERVE : 0, &tp); + if (error) + return error; + + xfs_ilock(ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip, 0); + + error = xfs_qm_dqattach_locked(ip, false); + if (error) { + /* Caller should have allocated the dquots! */ + ASSERT(error != -ENOENT); + goto out_cancel; + } + + error = xfs_trans_reserve_quota_nblks(tp, ip, dblocks, 0, force); + if (error) + goto out_cancel; + + *tpp = tp; + return 0; + +out_cancel: + xfs_trans_cancel(tp); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + return error; +} diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h index f4159326411a..920df48561df 100644 --- a/fs/xfs/xfs_trans.h +++ b/fs/xfs/xfs_trans.h @@ -253,4 +253,7 @@ xfs_trans_item_relog( return lip->li_ops->iop_relog(lip, tp); }
+int xfs_trans_alloc_inode(struct xfs_inode *ip, struct xfs_trans_res *resv, + unsigned int dblocks, bool force, struct xfs_trans **tpp); + #endif /* __XFS_TRANS_H__ */
From: "Darrick J. Wong" djwong@kernel.org
mainline-inclusion from mainline-v5.11-rc4 commit 3de4eb106fcc97f086b78bd17a0c3529691e8259 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4KIAO CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
-------------------------------------------------
Make it so that we can reserve rt blocks with the xfs_trans_alloc_inode wrapper function, then convert a few more callsites.
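With the new parameter, a realtime-aware caller looks like the xfs_alloc_file_space conversion below; internally the helper feeds rblocks / mp->m_sb.sb_rextsize to xfs_trans_alloc() as the rt extent count (see the xfs_trans.c hunk):

    error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_write,
                    dblocks, rblocks, false, &tp);
    if (error)
            break;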
Signed-off-by: Darrick J. Wong djwong@kernel.org Reviewed-by: Christoph Hellwig hch@lst.de Reviewed-by: Brian Foster bfoster@redhat.com Signed-off-by: Lihong Kou koulihong@huawei.com Reviewed-by: Zhang Yi yi.zhang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- fs/xfs/libxfs/xfs_attr.c | 2 +- fs/xfs/libxfs/xfs_bmap.c | 2 +- fs/xfs/xfs_bmap_util.c | 28 +++++----------------------- fs/xfs/xfs_iomap.c | 22 +++++----------------- fs/xfs/xfs_trans.c | 6 ++++-- fs/xfs/xfs_trans.h | 3 ++- 6 files changed, 18 insertions(+), 45 deletions(-)
diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c index 1e29fc115034..60cf08e0cb5e 100644 --- a/fs/xfs/libxfs/xfs_attr.c +++ b/fs/xfs/libxfs/xfs_attr.c @@ -453,7 +453,7 @@ xfs_attr_set( * Root fork attributes can use reserved data blocks for this * operation if necessary */ - error = xfs_trans_alloc_inode(dp, &tres, total, rsvd, &args->trans); + error = xfs_trans_alloc_inode(dp, &tres, total, 0, rsvd, &args->trans); if (error) return error;
diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 84bb3813afd7..3a77db5a6ce8 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -1078,7 +1078,7 @@ xfs_bmap_add_attrfork(
blks = XFS_ADDAFORK_SPACE_RES(mp);
- error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_addafork, blks, + error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_addafork, blks, 0, rsvd, &tp); if (error) return error; diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index 323791539b2f..d0113dec5d32 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -730,7 +730,6 @@ xfs_alloc_file_space( int rt; xfs_trans_t *tp; xfs_bmbt_irec_t imaps[1], *imapp; - uint resblks, resrtextents; int error;
trace_xfs_alloc_file_space(ip); @@ -760,7 +759,7 @@ xfs_alloc_file_space( */ while (allocatesize_fsb && !error) { xfs_fileoff_t s, e; - unsigned int dblocks, rblocks; + unsigned int dblocks, rblocks, resblks;
/* * Determine space reservations for data/realtime. @@ -790,8 +789,6 @@ xfs_alloc_file_space( */ resblks = min_t(xfs_fileoff_t, (e - s), (MAXEXTLEN * nimaps)); if (unlikely(rt)) { - resrtextents = resblks; - resrtextents /= mp->m_sb.sb_rextsize; dblocks = XFS_DIOSTRAT_SPACE_RES(mp, 0); rblocks = resblks; } else { @@ -802,26 +799,11 @@ xfs_alloc_file_space( /* * Allocate and setup the transaction. */ - error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, dblocks, - resrtextents, 0, &tp); - - /* - * Check for running out of space - */ - if (error) { - /* - * Free the transaction structure. - */ - ASSERT(error == -ENOSPC || XFS_FORCED_SHUTDOWN(mp)); - break; - } - xfs_ilock(ip, XFS_ILOCK_EXCL); - error = xfs_trans_reserve_quota_nblks(tp, ip, dblocks, rblocks, - false); + error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_write, + dblocks, rblocks, false, &tp); if (error) - goto error; + break;
- xfs_trans_ijoin(tp, ip, 0);
error = xfs_bmapi_write(tp, ip, startoffset_fsb, allocatesize_fsb, alloc_type, 0, imapp, @@ -868,7 +850,7 @@ xfs_unmap_extent( uint resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0); int error;
- error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_write, resblks, + error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_write, resblks, 0, false, &tp); if (error) return error; diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 5a87c5372db0..6a28c3e78e52 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -195,19 +195,15 @@ xfs_iomap_write_direct( xfs_filblks_t resaligned; int nimaps; unsigned int dblocks, rblocks; - unsigned int resrtextents = 0; + bool force = false; int error; int bmapi_flags = XFS_BMAPI_PREALLOC; - int tflags = 0; - bool force = false;
ASSERT(count_fsb > 0);
resaligned = xfs_aligned_fsb_count(offset_fsb, count_fsb, xfs_get_extsz_hint(ip)); if (unlikely(XFS_IS_REALTIME_INODE(ip))) { - resrtextents = resaligned; - resrtextents /= mp->m_sb.sb_rextsize; dblocks = XFS_DIOSTRAT_SPACE_RES(mp, 0); rblocks = resaligned; } else { @@ -236,22 +232,14 @@ xfs_iomap_write_direct( bmapi_flags = XFS_BMAPI_CONVERT | XFS_BMAPI_ZERO; if (imap->br_state == XFS_EXT_UNWRITTEN) { force = true; - tflags |= XFS_TRANS_RESERVE; dblocks = XFS_DIOSTRAT_SPACE_RES(mp, 0) << 1; } } - error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, dblocks, resrtextents, - tflags, &tp); - if (error) - return error;
- xfs_ilock(ip, XFS_ILOCK_EXCL); - - error = xfs_trans_reserve_quota_nblks(tp, ip, dblocks, rblocks, force); + error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_write, dblocks, + rblocks, force, &tp); if (error) - goto out_trans_cancel; - - xfs_trans_ijoin(tp, ip, 0); + return error;
/* * From this point onwards we overwrite the imap pointer that the @@ -548,7 +536,7 @@ xfs_iomap_write_unwritten( * complete here and might deadlock on the iolock. */ error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_write, resblks, - true, &tp); + 0, true, &tp); if (error) return error;
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index a36b2c133f93..5aee7e801a22 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -1034,6 +1034,7 @@ xfs_trans_alloc_inode( struct xfs_inode *ip, struct xfs_trans_res *resv, unsigned int dblocks, + unsigned int rblocks, bool force, struct xfs_trans **tpp) { @@ -1041,7 +1042,8 @@ xfs_trans_alloc_inode( struct xfs_mount *mp = ip->i_mount; int error;
- error = xfs_trans_alloc(mp, resv, dblocks, 0, + error = xfs_trans_alloc(mp, resv, dblocks, + rblocks / mp->m_sb.sb_rextsize, force ? XFS_TRANS_RESERVE : 0, &tp); if (error) return error; @@ -1056,7 +1058,7 @@ xfs_trans_alloc_inode( goto out_cancel; }
- error = xfs_trans_reserve_quota_nblks(tp, ip, dblocks, 0, force); + error = xfs_trans_reserve_quota_nblks(tp, ip, dblocks, rblocks, force); if (error) goto out_cancel;
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h index 920df48561df..d9a964bcc23b 100644 --- a/fs/xfs/xfs_trans.h +++ b/fs/xfs/xfs_trans.h @@ -254,6 +254,7 @@ xfs_trans_item_relog( }
int xfs_trans_alloc_inode(struct xfs_inode *ip, struct xfs_trans_res *resv, - unsigned int dblocks, bool force, struct xfs_trans **tpp); + unsigned int dblocks, unsigned int rblocks, bool force, + struct xfs_trans **tpp);
#endif /* __XFS_TRANS_H__ */
From: "Darrick J. Wong" djwong@kernel.org
mainline-inclusion from mainline-v5.11-rc4 commit f273387b048543f2b8b2d809cc65fca28e7788a1 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4KIAO CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
-------------------------------------------------
The two remaining callers of xfs_trans_reserve_quota_nblks are in the reflink code. These conversions aren't as uniform as the previous conversions, so call that out in a separate patch.
Signed-off-by: Darrick J. Wong djwong@kernel.org Reviewed-by: Brian Foster bfoster@redhat.com Reviewed-by: Christoph Hellwig hch@lst.de Signed-off-by: Lihong Kou koulihong@huawei.com Reviewed-by: Zhang Yi yi.zhang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- fs/xfs/xfs_iomap.c | 3 ++- fs/xfs/xfs_reflink.c | 53 ++++++++++++++++++-------------------------- 2 files changed, 23 insertions(+), 33 deletions(-)
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 6a28c3e78e52..36ed2901f395 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -822,7 +822,8 @@ xfs_direct_write_iomap_begin( return xfs_bmbt_to_iomap(ip, iomap, &cmap, IOMAP_F_SHARED);
out_unlock: - xfs_iunlock(ip, lockmode); + if (lockmode) + xfs_iunlock(ip, lockmode); return error; }
diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index bcf0d349d3fc..20e7b1706718 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -376,16 +376,14 @@ xfs_reflink_allocate_cow( resblks = XFS_DIOSTRAT_SPACE_RES(mp, resaligned);
xfs_iunlock(ip, *lockmode); - error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, 0, &tp); - *lockmode = XFS_ILOCK_EXCL; - xfs_ilock(ip, *lockmode); + *lockmode = 0;
+ error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_write, resblks, 0, + false, &tp); if (error) return error;
- error = xfs_qm_dqattach_locked(ip, false); - if (error) - goto out_trans_cancel; + *lockmode = XFS_ILOCK_EXCL;
/* * Check for an overlapping extent again now that we dropped the ilock. @@ -398,12 +396,6 @@ xfs_reflink_allocate_cow( goto convert; }
- error = xfs_trans_reserve_quota_nblks(tp, ip, resblks, 0, false); - if (error) - goto out_trans_cancel; - - xfs_trans_ijoin(tp, ip, 0); - /* Allocate the entire reservation as unwritten blocks. */ nimaps = 1; error = xfs_bmapi_write(tp, ip, imap->br_startoff, imap->br_blockcount, @@ -992,22 +984,29 @@ xfs_reflink_remap_extent( struct xfs_mount *mp = ip->i_mount; struct xfs_trans *tp; xfs_off_t newlen; - int64_t qres, qdelta; + int64_t qdelta = 0; unsigned int resblks; bool smap_real; bool dmap_written = xfs_bmap_is_written_extent(dmap); int nimaps; int error;
- /* Start a rolling transaction to switch the mappings */ + /* + * Start a rolling transaction to switch the mappings. + * + * Adding a written extent to the extent map can cause a bmbt split, + * and removing a mapped extent from the extent map can cause a bmbt split. + * The two operations cannot both cause a split since they operate on + * the same index in the bmap btree, so we only need a reservation for + * one bmbt split if either thing is happening. However, we haven't + * locked the inode yet, so we reserve assuming this is the case. + */ resblks = XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK); - error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, 0, &tp); + error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_write, resblks, 0, + false, &tp); if (error) goto out;
- xfs_ilock(ip, XFS_ILOCK_EXCL); - xfs_trans_ijoin(tp, ip, 0); - /* * Read what's currently mapped in the destination file into smap. * If smap isn't a hole, we will have to remove it before we can add @@ -1055,15 +1054,9 @@ xfs_reflink_remap_extent( }
/* - * Compute quota reservation if we think the quota block counter for + * Increase quota reservation if we think the quota block counter for * this file could increase. * - * Adding a written extent to the extent map can cause a bmbt split, - * and removing a mapped extent from the extent can cause a bmbt split. - * The two operations cannot both cause a split since they operate on - * the same index in the bmap btree, so we only need a reservation for - * one bmbt split if either thing is happening. - * * If we are mapping a written extent into the file, we need to have * enough quota block count reservation to handle the blocks in that * extent. We log only the delta to the quota block counts, so if the @@ -1077,13 +1070,9 @@ xfs_reflink_remap_extent( * before we started. That should have removed all the delalloc * reservations, but we code defensively. */ - qres = qdelta = 0; - if (smap_real || dmap_written) - qres = XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK); - if (!smap_real && dmap_written) - qres += dmap->br_blockcount; - if (qres > 0) { - error = xfs_trans_reserve_quota_nblks(tp, ip, qres, 0, false); + if (!smap_real && dmap_written) { + error = xfs_trans_reserve_quota_nblks(tp, ip, + dmap->br_blockcount, 0, false); if (error) goto out_cancel; }
From: "Darrick J. Wong" djwong@kernel.org
mainline-inclusion from mainline-v5.11-rc4 commit f2f7b9ff62a28928f6fe2bd55cdb4d4b02ab7477 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4KIAO CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
-------------------------------------------------
For file creation, create a new helper xfs_trans_alloc_icreate that allocates a transaction and reserves the appropriate amount of quota against that transaction. Replace all the open-coded idioms with a single call to this helper so that we can contain the retry loops in the next patchset.
This changes the locking behavior for non-tempfile creation slightly, in that we now make the quota reservation without holding the directory ILOCK. While the dquots chosen for inode creation are based on the directory state at a given point in time, the directory ILOCK was released as soon as the dquot references were picked up. Hence it was never necessary to hold the directory ILOCK for the quota reservation.
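The resulting call pattern in xfs_create, condensed from the diff below (the -ENOSPC retry flushes outstanding delalloc blocks and tries once more):

	error = xfs_trans_alloc_icreate(mp, tres, udqp, gdqp, pdqp, resblks,
			&tp);
	if (error == -ENOSPC) {
		/* flush outstanding delalloc blocks and retry */
		xfs_flush_inodes(mp);
		error = xfs_trans_alloc_icreate(mp, tres, udqp, gdqp, pdqp,
				resblks, &tp);
	}
	if (error)
		goto out_release_dquots;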
Signed-off-by: Darrick J. Wong djwong@kernel.org Reviewed-by: Brian Foster bfoster@redhat.com Reviewed-by: Christoph Hellwig hch@lst.de Signed-off-by: Lihong Kou koulihong@huawei.com Reviewed-by: Zhang Yi yi.zhang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- fs/xfs/xfs_inode.c | 28 ++++++++++------------------ fs/xfs/xfs_symlink.c | 14 ++++---------- fs/xfs/xfs_trans.c | 33 +++++++++++++++++++++++++++++++++ fs/xfs/xfs_trans.h | 6 ++++++ 4 files changed, 53 insertions(+), 28 deletions(-)
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index fe41492b80d5..5983eee137bd 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -1157,25 +1157,20 @@ xfs_create( * the case we'll drop the one we have and get a more * appropriate transaction later. */ - error = xfs_trans_alloc(mp, tres, resblks, 0, 0, &tp); + error = xfs_trans_alloc_icreate(mp, tres, udqp, gdqp, pdqp, resblks, + &tp); if (error == -ENOSPC) { /* flush outstanding delalloc blocks and retry */ xfs_flush_inodes(mp); - error = xfs_trans_alloc(mp, tres, resblks, 0, 0, &tp); + error = xfs_trans_alloc_icreate(mp, tres, udqp, gdqp, pdqp, + resblks, &tp); } if (error) - goto out_release_inode; + goto out_release_dquots;
xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT); unlock_dp_on_error = true;
- /* - * Reserve disk quota and the inode. - */ - error = xfs_trans_reserve_quota_icreate(tp, udqp, gdqp, pdqp, resblks); - if (error) - goto out_trans_cancel; - /* * A newly created regular or special file just has one directory * entry pointing to them, but a directory also the "." entry @@ -1250,7 +1245,7 @@ xfs_create( xfs_finish_inode_setup(ip); xfs_irele(ip); } - + out_release_dquots: xfs_qm_dqrele(udqp); xfs_qm_dqrele(gdqp); xfs_qm_dqrele(pdqp); @@ -1294,13 +1289,10 @@ xfs_create_tmpfile( resblks = XFS_IALLOC_SPACE_RES(mp); tres = &M_RES(mp)->tr_create_tmpfile;
- error = xfs_trans_alloc(mp, tres, resblks, 0, 0, &tp); - if (error) - goto out_release_inode; - - error = xfs_trans_reserve_quota_icreate(tp, udqp, gdqp, pdqp, resblks); + error = xfs_trans_alloc_icreate(mp, tres, udqp, gdqp, pdqp, resblks, + &tp); if (error) - goto out_trans_cancel; + goto out_release_dquots;
error = xfs_dir_ialloc(&tp, dp, mode, 0, 0, prid, &ip); if (error) @@ -1343,7 +1335,7 @@ xfs_create_tmpfile( xfs_finish_inode_setup(ip); xfs_irele(ip); } - + out_release_dquots: xfs_qm_dqrele(udqp); xfs_qm_dqrele(gdqp); xfs_qm_dqrele(pdqp); diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c index 3ccf77a126b9..f614e28f42d5 100644 --- a/fs/xfs/xfs_symlink.c +++ b/fs/xfs/xfs_symlink.c @@ -197,9 +197,10 @@ xfs_symlink( fs_blocks = xfs_symlink_blocks(mp, pathlen); resblks = XFS_SYMLINK_SPACE_RES(mp, link_name->len, fs_blocks);
- error = xfs_trans_alloc(mp, &M_RES(mp)->tr_symlink, resblks, 0, 0, &tp); + error = xfs_trans_alloc_icreate(mp, &M_RES(mp)->tr_symlink, udqp, gdqp, + pdqp, resblks, &tp); if (error) - goto out_release_inode; + goto out_release_dquots;
xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT); unlock_dp_on_error = true; @@ -212,13 +213,6 @@ xfs_symlink( goto out_trans_cancel; }
- /* - * Reserve disk quota : blocks and inode. - */ - error = xfs_trans_reserve_quota_icreate(tp, udqp, gdqp, pdqp, resblks); - if (error) - goto out_trans_cancel; - /* * Allocate an inode for the symlink. */ @@ -341,7 +335,7 @@ xfs_symlink( xfs_finish_inode_setup(ip); xfs_irele(ip); } - +out_release_dquots: xfs_qm_dqrele(udqp); xfs_qm_dqrele(gdqp); xfs_qm_dqrele(pdqp); diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index 5aee7e801a22..61604f5ed4b8 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -21,6 +21,8 @@ #include "xfs_error.h" #include "xfs_defer.h" #include "xfs_inode.h" +#include "xfs_dquot_item.h" +#include "xfs_dquot.h"
kmem_zone_t *xfs_trans_zone;
@@ -1070,3 +1072,34 @@ xfs_trans_alloc_inode( xfs_iunlock(ip, XFS_ILOCK_EXCL); return error; } + +/* + * Allocate a transaction in preparation for inode creation by reserving quota + * against the given dquots. Callers are not required to hold any inode locks. + */ +int +xfs_trans_alloc_icreate( + struct xfs_mount *mp, + struct xfs_trans_res *resv, + struct xfs_dquot *udqp, + struct xfs_dquot *gdqp, + struct xfs_dquot *pdqp, + unsigned int dblocks, + struct xfs_trans **tpp) +{ + struct xfs_trans *tp; + int error; + + error = xfs_trans_alloc(mp, resv, dblocks, 0, 0, &tp); + if (error) + return error; + + error = xfs_trans_reserve_quota_icreate(tp, udqp, gdqp, pdqp, dblocks); + if (error) { + xfs_trans_cancel(tp); + return error; + } + + *tpp = tp; + return 0; +} diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h index d9a964bcc23b..5a73e8572f74 100644 --- a/fs/xfs/xfs_trans.h +++ b/fs/xfs/xfs_trans.h @@ -253,8 +253,14 @@ xfs_trans_item_relog( return lip->li_ops->iop_relog(lip, tp); }
+struct xfs_dquot; + int xfs_trans_alloc_inode(struct xfs_inode *ip, struct xfs_trans_res *resv, unsigned int dblocks, unsigned int rblocks, bool force, struct xfs_trans **tpp); +int xfs_trans_alloc_icreate(struct xfs_mount *mp, struct xfs_trans_res *resv, + struct xfs_dquot *udqp, struct xfs_dquot *gdqp, + struct xfs_dquot *pdqp, unsigned int dblocks, + struct xfs_trans **tpp);
#endif /* __XFS_TRANS_H__ */
From: "Darrick J. Wong" djwong@kernel.org
mainline-inclusion from mainline-v5.11-rc4 commit 7317a03df703f7cdae3ae9e9635a0ef45849fe09 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4KIAO CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
-------------------------------------------------
For file ownership (uid, gid, prid) changes, create a new helper xfs_trans_alloc_ichange that allocates a transaction and reserves the appropriate amount of quota against that transaction in preparation for a change of user, group, or project id. Replace all the open-coded idioms with a single call to this helper so that we can contain the retry loops in the next patchset.
This changes the locking behavior for ichange transactions slightly. Since tr_ichange does not have a permanent reservation and cannot roll, we pass XFS_ILOCK_EXCL to ijoin so that the inode will be unlocked automatically at commit time.
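In caller terms, the xfs_setattr_nonsize conversion below boils down to this sketch (surrounding code elided):

	error = xfs_trans_alloc_ichange(ip, udqp, gdqp, NULL,
			capable(CAP_FOWNER), &tp);
	if (error)
		goto out_dqrele;

	/* ... change file ownership ... */

	/* No explicit xfs_iunlock(): commit/cancel drops ILOCK_EXCL. */
	error = xfs_trans_commit(tp);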
Signed-off-by: Darrick J. Wong djwong@kernel.org Reviewed-by: Christoph Hellwig hch@lst.de Reviewed-by: Brian Foster bfoster@redhat.com Signed-off-by: Lihong Kou koulihong@huawei.com Reviewed-by: Zhang Yi yi.zhang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- fs/xfs/xfs_ioctl.c | 29 ++++++++-------------- fs/xfs/xfs_iops.c | 26 ++----------------- fs/xfs/xfs_trans.c | 62 ++++++++++++++++++++++++++++++++++++++++++++++ fs/xfs/xfs_trans.h | 3 +++ 4 files changed, 77 insertions(+), 43 deletions(-)
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index f6bb581edc6c..34c982bdb200 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -1275,24 +1275,23 @@ xfs_ioctl_setattr_prepare_dax( */ static struct xfs_trans * xfs_ioctl_setattr_get_trans( - struct xfs_inode *ip) + struct xfs_inode *ip, + struct xfs_dquot *pdqp) { struct xfs_mount *mp = ip->i_mount; struct xfs_trans *tp; int error = -EROFS;
if (mp->m_flags & XFS_MOUNT_RDONLY) - goto out_unlock; + goto out_error; error = -EIO; if (XFS_FORCED_SHUTDOWN(mp)) - goto out_unlock; + goto out_error;
- error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ichange, 0, 0, 0, &tp); + error = xfs_trans_alloc_ichange(ip, NULL, NULL, pdqp, + capable(CAP_FOWNER), &tp); if (error) - goto out_unlock; - - xfs_ilock(ip, XFS_ILOCK_EXCL); - xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); + goto out_error;
/* * CAP_FOWNER overrides the following restrictions: @@ -1312,7 +1311,7 @@ xfs_ioctl_setattr_get_trans(
out_cancel: xfs_trans_cancel(tp); -out_unlock: +out_error: return ERR_PTR(error); }
@@ -1462,20 +1461,12 @@ xfs_ioctl_setattr(
xfs_ioctl_setattr_prepare_dax(ip, fa);
- tp = xfs_ioctl_setattr_get_trans(ip); + tp = xfs_ioctl_setattr_get_trans(ip, pdqp); if (IS_ERR(tp)) { code = PTR_ERR(tp); goto error_free_dquots; }
- if (XFS_IS_QUOTA_RUNNING(mp) && XFS_IS_PQUOTA_ON(mp) && - ip->i_d.di_projid != fa->fsx_projid) { - code = xfs_qm_vop_chown_reserve(tp, ip, NULL, NULL, pdqp, - capable(CAP_FOWNER) ? XFS_QMOPT_FORCE_RES : 0); - if (code) /* out of quota */ - goto error_trans_cancel; - } - xfs_fill_fsxattr(ip, false, &old_fa); code = vfs_ioc_fssetxattr_check(VFS_I(ip), &old_fa, fa); if (code) @@ -1608,7 +1599,7 @@ xfs_ioc_setxflags(
xfs_ioctl_setattr_prepare_dax(ip, &fa);
- tp = xfs_ioctl_setattr_get_trans(ip); + tp = xfs_ioctl_setattr_get_trans(ip, NULL); if (IS_ERR(tp)) { error = PTR_ERR(tp); goto out_drop_write; diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index 6a3026e78a9b..afacde25c9ed 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -703,13 +703,11 @@ xfs_setattr_nonsize( return error; }
- error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ichange, 0, 0, 0, &tp); + error = xfs_trans_alloc_ichange(ip, udqp, gdqp, NULL, + capable(CAP_FOWNER), &tp); if (error) goto out_dqrele;
- xfs_ilock(ip, XFS_ILOCK_EXCL); - xfs_trans_ijoin(tp, ip, 0); - /* * Change file ownership. Must be the owner or privileged. */ @@ -725,21 +723,6 @@ xfs_setattr_nonsize( gid = (mask & ATTR_GID) ? iattr->ia_gid : igid; uid = (mask & ATTR_UID) ? iattr->ia_uid : iuid;
- /* - * Do a quota reservation only if uid/gid is actually - * going to change. - */ - if (XFS_IS_QUOTA_RUNNING(mp) && - ((XFS_IS_UQUOTA_ON(mp) && !uid_eq(iuid, uid)) || - (XFS_IS_GQUOTA_ON(mp) && !gid_eq(igid, gid)))) { - ASSERT(tp); - error = xfs_qm_vop_chown_reserve(tp, ip, udqp, gdqp, - NULL, capable(CAP_FOWNER) ? - XFS_QMOPT_FORCE_RES : 0); - if (error) /* out of quota */ - goto out_cancel; - } - /* * CAP_FSETID overrides the following restrictions: * @@ -789,8 +772,6 @@ xfs_setattr_nonsize( xfs_trans_set_sync(tp); error = xfs_trans_commit(tp);
- xfs_iunlock(ip, XFS_ILOCK_EXCL); - /* * Release any dquot(s) the inode had kept before chown. */ @@ -817,9 +798,6 @@ xfs_setattr_nonsize(
return 0;
-out_cancel: - xfs_trans_cancel(tp); - xfs_iunlock(ip, XFS_ILOCK_EXCL); out_dqrele: xfs_qm_dqrele(udqp); xfs_qm_dqrele(gdqp); diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index 61604f5ed4b8..8a9fedea1a66 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -1103,3 +1103,65 @@ xfs_trans_alloc_icreate( *tpp = tp; return 0; } + +/* + * Allocate a transaction, lock and join the inode to it, and reserve quota + * in preparation for inode attribute changes that include uid, gid, or prid + * changes. + * + * The caller must ensure that the on-disk dquots attached to this inode have + * already been allocated and initialized. The ILOCK will be dropped when the + * transaction is committed or cancelled. + */ +int +xfs_trans_alloc_ichange( + struct xfs_inode *ip, + struct xfs_dquot *udqp, + struct xfs_dquot *gdqp, + struct xfs_dquot *pdqp, + bool force, + struct xfs_trans **tpp) +{ + struct xfs_trans *tp; + struct xfs_mount *mp = ip->i_mount; + int error; + + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ichange, 0, 0, 0, &tp); + if (error) + return error; + + xfs_ilock(ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); + + error = xfs_qm_dqattach_locked(ip, false); + if (error) { + /* Caller should have allocated the dquots! */ + ASSERT(error != -ENOENT); + goto out_cancel; + } + + /* + * For each quota type, skip quota reservations if the inode's dquots + * now match the ones that came from the caller, or the caller didn't + * pass one in. + */ + if (udqp == ip->i_udquot) + udqp = NULL; + if (gdqp == ip->i_gdquot) + gdqp = NULL; + if (pdqp == ip->i_pdquot) + pdqp = NULL; + if (udqp || gdqp || pdqp) { + error = xfs_qm_vop_chown_reserve(tp, ip, udqp, gdqp, pdqp, + force ? XFS_QMOPT_FORCE_RES : 0); + if (error) + goto out_cancel; + } + + *tpp = tp; + return 0; + +out_cancel: + xfs_trans_cancel(tp); + return error; +} diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h index 5a73e8572f74..f95566fe981b 100644 --- a/fs/xfs/xfs_trans.h +++ b/fs/xfs/xfs_trans.h @@ -262,5 +262,8 @@ int xfs_trans_alloc_icreate(struct xfs_mount *mp, struct xfs_trans_res *resv, struct xfs_dquot *udqp, struct xfs_dquot *gdqp, struct xfs_dquot *pdqp, unsigned int dblocks, struct xfs_trans **tpp); +int xfs_trans_alloc_ichange(struct xfs_inode *ip, struct xfs_dquot *udqp, + struct xfs_dquot *gdqp, struct xfs_dquot *pdqp, bool force, + struct xfs_trans **tpp);
#endif /* __XFS_TRANS_H__ */
From: "Darrick J. Wong" djwong@kernel.org
mainline-inclusion from mainline-v5.11-rc4 commit 5c615f0feb9a559abd08da0842d6fcfee105b7e3 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4KIAO CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
-------------------------------------------------
Now that the only caller of this function is xfs_trans_alloc_ichange, just open-code the meat of _chown_reserve in that caller. Drop the (redundant) [ugp]id checks because xfs has a 1:1 relationship between quota ids and incore dquots.
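The open-coded reservation inside xfs_trans_alloc_ichange then reduces to the following (condensed from the diff below):

	unsigned int	qflags = XFS_QMOPT_RES_REGBLKS;

	if (force)
		qflags |= XFS_QMOPT_FORCE_RES;

	/* Reserve for blocks on disk plus the delalloc reservation. */
	error = xfs_trans_reserve_quota_bydquots(tp, mp, udqp, gdqp, pdqp,
			ip->i_d.di_nblocks + ip->i_delayed_blks, 1, qflags);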
Signed-off-by: Darrick J. Wong djwong@kernel.org Reviewed-by: Brian Foster bfoster@redhat.com Reviewed-by: Christoph Hellwig hch@lst.de Signed-off-by: Lihong Kou koulihong@huawei.com Reviewed-by: Zhang Yi yi.zhang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- fs/xfs/xfs_qm.c | 48 ---------------------------------------------- fs/xfs/xfs_quota.h | 4 ---- fs/xfs/xfs_trans.c | 16 ++++++++++++++-- 3 files changed, 14 insertions(+), 54 deletions(-)
diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index 64e5da33733b..c660db6d5dde 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -1817,54 +1817,6 @@ xfs_qm_vop_chown( return prevdq; }
-/* - * Quota reservations for setattr(AT_UID|AT_GID|AT_PROJID). - */ -int -xfs_qm_vop_chown_reserve( - struct xfs_trans *tp, - struct xfs_inode *ip, - struct xfs_dquot *udqp, - struct xfs_dquot *gdqp, - struct xfs_dquot *pdqp, - uint flags) -{ - struct xfs_mount *mp = ip->i_mount; - unsigned int blkflags; - struct xfs_dquot *udq_delblks = NULL; - struct xfs_dquot *gdq_delblks = NULL; - struct xfs_dquot *pdq_delblks = NULL; - - ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)); - ASSERT(XFS_IS_QUOTA_RUNNING(mp)); - - blkflags = XFS_IS_REALTIME_INODE(ip) ? - XFS_QMOPT_RES_RTBLKS : XFS_QMOPT_RES_REGBLKS; - - if (XFS_IS_UQUOTA_ON(mp) && udqp && - i_uid_read(VFS_I(ip)) != udqp->q_id) - udq_delblks = udqp; - - if (XFS_IS_GQUOTA_ON(ip->i_mount) && gdqp && - i_gid_read(VFS_I(ip)) != gdqp->q_id) - gdq_delblks = gdqp; - - if (XFS_IS_PQUOTA_ON(ip->i_mount) && pdqp && - ip->i_d.di_projid != pdqp->q_id) - pdq_delblks = pdqp; - - /* - * Reserve enough quota to handle blocks on disk and reserved for a - * delayed allocation. We'll actually transfer the delalloc - * reservation between dquots at chown time, even though that part is - * only semi-transactional. - */ - return xfs_trans_reserve_quota_bydquots(tp, ip->i_mount, udq_delblks, - gdq_delblks, pdq_delblks, - ip->i_d.di_nblocks + ip->i_delayed_blks, - 1, blkflags | flags); -} - int xfs_qm_vop_rename_dqattach( struct xfs_inode **i_tab) diff --git a/fs/xfs/xfs_quota.h b/fs/xfs/xfs_quota.h index 6ddc4b358ede..d00d01302545 100644 --- a/fs/xfs/xfs_quota.h +++ b/fs/xfs/xfs_quota.h @@ -98,9 +98,6 @@ extern void xfs_qm_vop_create_dqattach(struct xfs_trans *, struct xfs_inode *, extern int xfs_qm_vop_rename_dqattach(struct xfs_inode **); extern struct xfs_dquot *xfs_qm_vop_chown(struct xfs_trans *, struct xfs_inode *, struct xfs_dquot **, struct xfs_dquot *); -extern int xfs_qm_vop_chown_reserve(struct xfs_trans *, struct xfs_inode *, - struct xfs_dquot *, struct xfs_dquot *, - struct xfs_dquot *, uint); extern int xfs_qm_dqattach(struct xfs_inode *); extern int xfs_qm_dqattach_locked(struct xfs_inode *ip, bool doalloc); extern void xfs_qm_dqdetach(struct xfs_inode *); @@ -162,7 +159,6 @@ xfs_trans_reserve_quota_icreate(struct xfs_trans *tp, struct xfs_dquot *udqp, #define xfs_qm_vop_create_dqattach(tp, ip, u, g, p) #define xfs_qm_vop_rename_dqattach(it) (0) #define xfs_qm_vop_chown(tp, ip, old, new) (NULL) -#define xfs_qm_vop_chown_reserve(tp, ip, u, g, p, fl) (0) #define xfs_qm_dqattach(ip) (0) #define xfs_qm_dqattach_locked(ip, fl) (0) #define xfs_qm_dqdetach(ip) diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index 8a9fedea1a66..788a7d0fafaa 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -1152,8 +1152,20 @@ xfs_trans_alloc_ichange( if (pdqp == ip->i_pdquot) pdqp = NULL; if (udqp || gdqp || pdqp) { - error = xfs_qm_vop_chown_reserve(tp, ip, udqp, gdqp, pdqp, - force ? XFS_QMOPT_FORCE_RES : 0); + unsigned int qflags = XFS_QMOPT_RES_REGBLKS; + + if (force) + qflags |= XFS_QMOPT_FORCE_RES; + + /* + * Reserve enough quota to handle blocks on disk and reserved + * for a delayed allocation. We'll actually transfer the + * delalloc reservation between dquots at chown time, even + * though that part is only semi-transactional. + */ + error = xfs_trans_reserve_quota_bydquots(tp, mp, udqp, gdqp, + pdqp, ip->i_d.di_nblocks + ip->i_delayed_blks, + 1, qflags); if (error) goto out_cancel; }
From: "Darrick J. Wong" djwong@kernel.org
mainline-inclusion from mainline-v5.11-rc4 commit fea7aae6cecfed1b6a520cc527d297df8801b999 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4KIAO CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
-------------------------------------------------
Rename the 'code' variable to 'error' to follow the naming convention of most other functions in xfs.
Signed-off-by: Darrick J. Wong djwong@kernel.org Reviewed-by: Chaitanya Kulkarni chaitanya.kulkarni@wdc.com Reviewed-by: Christoph Hellwig hch@lst.de Reviewed-by: Brian Foster bfoster@redhat.com Signed-off-by: Lihong Kou koulihong@huawei.com Reviewed-by: Zhang Yi yi.zhang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- fs/xfs/xfs_ioctl.c | 38 +++++++++++++++++++------------------- 1 file changed, 19 insertions(+), 19 deletions(-)
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index 34c982bdb200..cde305a15c84 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -1435,13 +1435,13 @@ xfs_ioctl_setattr( struct xfs_trans *tp; struct xfs_dquot *pdqp = NULL; struct xfs_dquot *olddquot = NULL; - int code; + int error;
trace_xfs_ioctl_setattr(ip);
- code = xfs_ioctl_setattr_check_projid(ip, fa); - if (code) - return code; + error = xfs_ioctl_setattr_check_projid(ip, fa); + if (error) + return error;
/* * If disk quotas is on, we make sure that the dquots do exist on disk, @@ -1452,36 +1452,36 @@ xfs_ioctl_setattr( * because the i_*dquot fields will get updated anyway. */ if (XFS_IS_QUOTA_ON(mp)) { - code = xfs_qm_vop_dqalloc(ip, VFS_I(ip)->i_uid, + error = xfs_qm_vop_dqalloc(ip, VFS_I(ip)->i_uid, VFS_I(ip)->i_gid, fa->fsx_projid, XFS_QMOPT_PQUOTA, NULL, NULL, &pdqp); - if (code) - return code; + if (error) + return error; }
xfs_ioctl_setattr_prepare_dax(ip, fa);
tp = xfs_ioctl_setattr_get_trans(ip, pdqp); if (IS_ERR(tp)) { - code = PTR_ERR(tp); + error = PTR_ERR(tp); goto error_free_dquots; }
xfs_fill_fsxattr(ip, false, &old_fa); - code = vfs_ioc_fssetxattr_check(VFS_I(ip), &old_fa, fa); - if (code) + error = vfs_ioc_fssetxattr_check(VFS_I(ip), &old_fa, fa); + if (error) goto error_trans_cancel;
- code = xfs_ioctl_setattr_check_extsize(ip, fa); - if (code) + error = xfs_ioctl_setattr_check_extsize(ip, fa); + if (error) goto error_trans_cancel;
- code = xfs_ioctl_setattr_check_cowextsize(ip, fa); - if (code) + error = xfs_ioctl_setattr_check_cowextsize(ip, fa); + if (error) goto error_trans_cancel;
- code = xfs_ioctl_setattr_xflags(tp, ip, fa); - if (code) + error = xfs_ioctl_setattr_xflags(tp, ip, fa); + if (error) goto error_trans_cancel;
/* @@ -1521,7 +1521,7 @@ xfs_ioctl_setattr( else ip->i_d.di_cowextsize = 0;
- code = xfs_trans_commit(tp); + error = xfs_trans_commit(tp);
/* * Release any dquot(s) the inode had kept before chown. @@ -1529,13 +1529,13 @@ xfs_ioctl_setattr( xfs_qm_dqrele(olddquot); xfs_qm_dqrele(pdqp);
- return code; + return error;
error_trans_cancel: xfs_trans_cancel(tp); error_free_dquots: xfs_qm_dqrele(pdqp); - return code; + return error; }
STATIC int
From: "Darrick J. Wong" djwong@kernel.org
mainline-inclusion from mainline-v5.11-rc4 commit 2a4bdfa8558ca2904dc17b83497dc82aa7fc05e9 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4KIAO CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
-------------------------------------------------
If we ever screw up the quota reservations enough to trip the assertions, something's wrong with the quota code. Shut down the filesystem when this happens, because this is corruption.
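The assertions become corruption checks that shut the filesystem down; inlined, the pattern from the diff below is:

	if (XFS_IS_CORRUPT(mp, dqp->q_blk.reserved < dqp->q_blk.count) ||
	    XFS_IS_CORRUPT(mp, dqp->q_rtb.reserved < dqp->q_rtb.count) ||
	    XFS_IS_CORRUPT(mp, dqp->q_ino.reserved < dqp->q_ino.count)) {
		xfs_dqunlock(dqp);
		xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
		return -EFSCORRUPTED;
	}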
Signed-off-by: Darrick J. Wong djwong@kernel.org Reviewed-by: Christoph Hellwig hch@lst.de Reviewed-by: Brian Foster bfoster@redhat.com Signed-off-by: Lihong Kou koulihong@huawei.com Reviewed-by: Zhang Yi yi.zhang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- fs/xfs/xfs_trans_dquot.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-)
diff --git a/fs/xfs/xfs_trans_dquot.c b/fs/xfs/xfs_trans_dquot.c index 520b24994813..52af4c7313eb 100644 --- a/fs/xfs/xfs_trans_dquot.c +++ b/fs/xfs/xfs_trans_dquot.c @@ -16,6 +16,7 @@ #include "xfs_quota.h" #include "xfs_qm.h" #include "xfs_trace.h" +#include "xfs_error.h"
STATIC void xfs_trans_alloc_dqinfo(xfs_trans_t *);
@@ -709,9 +710,11 @@ xfs_trans_dqresv( XFS_TRANS_DQ_RES_INOS, ninos); } - ASSERT(dqp->q_blk.reserved >= dqp->q_blk.count); - ASSERT(dqp->q_rtb.reserved >= dqp->q_rtb.count); - ASSERT(dqp->q_ino.reserved >= dqp->q_ino.count); + + if (XFS_IS_CORRUPT(mp, dqp->q_blk.reserved < dqp->q_blk.count) || + XFS_IS_CORRUPT(mp, dqp->q_rtb.reserved < dqp->q_rtb.count) || + XFS_IS_CORRUPT(mp, dqp->q_ino.reserved < dqp->q_ino.count)) + goto error_corrupt;
xfs_dqunlock(dqp); return 0; @@ -721,6 +724,10 @@ xfs_trans_dqresv( if (xfs_dquot_type(dqp) == XFS_DQTYPE_PROJ) return -ENOSPC; return -EDQUOT; +error_corrupt: + xfs_dqunlock(dqp); + xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); + return -EFSCORRUPTED; }
From: "Darrick J. Wong" djwong@kernel.org
mainline-inclusion from mainline-v5.11-rc4 commit a636b1d1cf73804e385990c975e33cf06c032b64 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4KIAO CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
-------------------------------------------------
The functions to run an eof/cowblocks scan to try to reduce quota usage are kind of a mess -- the logic repeatedly initializes an eofb structure and there are logic bugs in the code that result in the cowblocks scan never actually happening.
Replace all three functions with a single function that fills out an eofb and runs both eof and cowblocks scans.
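The skeleton of the combined helper, condensed from the diff below (the per-quota filter setup appears there in full):

	struct xfs_eofblocks	eofb = {0};
	bool			do_work = false;

	/* One sync scan, union filter covering every applicable quota. */
	eofb.eof_flags = XFS_EOF_FLAGS_UNION | XFS_EOF_FLAGS_SYNC;

	/* ... add uid/gid filters for enforced quotas near their limits ... */

	if (!do_work)
		return false;

	xfs_icache_free_eofblocks(ip->i_mount, &eofb);
	xfs_icache_free_cowblocks(ip->i_mount, &eofb);
	return true;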
Signed-off-by: Darrick J. Wong djwong@kernel.org Reviewed-by: Christoph Hellwig hch@lst.de Reviewed-by: Brian Foster bfoster@redhat.com Signed-off-by: Lihong Kou koulihong@huawei.com Reviewed-by: Zhang Yi yi.zhang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- fs/xfs/xfs_file.c | 15 ++++++--------- fs/xfs/xfs_icache.c | 46 ++++++++++++++++----------------------------- fs/xfs/xfs_icache.h | 4 ++-- 3 files changed, 24 insertions(+), 41 deletions(-)
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index a66afc7ddfdd..b3e273c6c093 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -666,7 +666,7 @@ xfs_file_buffered_aio_write( struct inode *inode = mapping->host; struct xfs_inode *ip = XFS_I(inode); ssize_t ret; - int enospc = 0; + bool cleared_space = false; int iolock;
if (iocb->ki_flags & IOCB_NOWAIT) @@ -698,19 +698,16 @@ xfs_file_buffered_aio_write( * also behaves as a filter to prevent too many eofblocks scans from * running at the same time. */ - if (ret == -EDQUOT && !enospc) { + if (ret == -EDQUOT && !cleared_space) { xfs_iunlock(ip, iolock); - enospc = xfs_inode_free_quota_eofblocks(ip); - if (enospc) - goto write_retry; - enospc = xfs_inode_free_quota_cowblocks(ip); - if (enospc) + cleared_space = xfs_inode_free_quota_blocks(ip); + if (cleared_space) goto write_retry; iolock = 0; - } else if (ret == -ENOSPC && !enospc) { + } else if (ret == -ENOSPC && !cleared_space) { struct xfs_eofblocks eofb = {0};
- enospc = 1; + cleared_space = true; xfs_flush_inodes(ip->i_mount);
xfs_iunlock(ip, iolock); diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index 39af75dd2bf6..2b87317120e1 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -1397,33 +1397,31 @@ xfs_icache_free_eofblocks( }
/* - * Run eofblocks scans on the quotas applicable to the inode. For inodes with - * multiple quotas, we don't know exactly which quota caused an allocation + * Run cow/eofblocks scans on the quotas applicable to the inode. For inodes + * with multiple quotas, we don't know exactly which quota caused an allocation * failure. We make a best effort by including each quota under low free space * conditions (less than 1% free space) in the scan. */ -static int -__xfs_inode_free_quota_eofblocks( - struct xfs_inode *ip, - int (*execute)(struct xfs_mount *mp, - struct xfs_eofblocks *eofb)) +bool +xfs_inode_free_quota_blocks( + struct xfs_inode *ip) { - int scan = 0; - struct xfs_eofblocks eofb = {0}; - struct xfs_dquot *dq; + struct xfs_eofblocks eofb = {0}; + struct xfs_dquot *dq; + bool do_work = false;
/* * Run a sync scan to increase effectiveness and use the union filter to * cover all applicable quotas in a single scan. */ - eofb.eof_flags = XFS_EOF_FLAGS_UNION|XFS_EOF_FLAGS_SYNC; + eofb.eof_flags = XFS_EOF_FLAGS_UNION | XFS_EOF_FLAGS_SYNC;
if (XFS_IS_UQUOTA_ENFORCED(ip->i_mount)) { dq = xfs_inode_dquot(ip, XFS_DQTYPE_USER); if (dq && xfs_dquot_lowsp(dq)) { eofb.eof_uid = VFS_I(ip)->i_uid; eofb.eof_flags |= XFS_EOF_FLAGS_UID; - scan = 1; + do_work = true; } }
@@ -1432,21 +1430,16 @@ __xfs_inode_free_quota_eofblocks( if (dq && xfs_dquot_lowsp(dq)) { eofb.eof_gid = VFS_I(ip)->i_gid; eofb.eof_flags |= XFS_EOF_FLAGS_GID; - scan = 1; + do_work = true; } }
- if (scan) - execute(ip->i_mount, &eofb); - - return scan; -} + if (!do_work) + return false;
-int -xfs_inode_free_quota_eofblocks( - struct xfs_inode *ip) -{ - return __xfs_inode_free_quota_eofblocks(ip, xfs_icache_free_eofblocks); + xfs_icache_free_eofblocks(ip->i_mount, &eofb); + xfs_icache_free_cowblocks(ip->i_mount, &eofb); + return true; }
static inline unsigned long @@ -1646,13 +1639,6 @@ xfs_icache_free_cowblocks( XFS_ICI_COWBLOCKS_TAG); }
-int -xfs_inode_free_quota_cowblocks( - struct xfs_inode *ip) -{ - return __xfs_inode_free_quota_eofblocks(ip, xfs_icache_free_cowblocks); -} - void xfs_inode_set_cowblocks_tag( xfs_inode_t *ip) diff --git a/fs/xfs/xfs_icache.h b/fs/xfs/xfs_icache.h index 3a4c8b382cd0..3f7ddbca8638 100644 --- a/fs/xfs/xfs_icache.h +++ b/fs/xfs/xfs_icache.h @@ -54,17 +54,17 @@ long xfs_reclaim_inodes_nr(struct xfs_mount *mp, int nr_to_scan);
void xfs_inode_set_reclaim_tag(struct xfs_inode *ip);
+bool xfs_inode_free_quota_blocks(struct xfs_inode *ip); + void xfs_inode_set_eofblocks_tag(struct xfs_inode *ip); void xfs_inode_clear_eofblocks_tag(struct xfs_inode *ip); int xfs_icache_free_eofblocks(struct xfs_mount *, struct xfs_eofblocks *); -int xfs_inode_free_quota_eofblocks(struct xfs_inode *ip); void xfs_eofblocks_worker(struct work_struct *); void xfs_queue_eofblocks(struct xfs_mount *);
void xfs_inode_set_cowblocks_tag(struct xfs_inode *ip); void xfs_inode_clear_cowblocks_tag(struct xfs_inode *ip); int xfs_icache_free_cowblocks(struct xfs_mount *, struct xfs_eofblocks *); -int xfs_inode_free_quota_cowblocks(struct xfs_inode *ip); void xfs_cowblocks_worker(struct work_struct *); void xfs_queue_cowblocks(struct xfs_mount *);
From: "Darrick J. Wong" djwong@kernel.org
mainline-inclusion from mainline-v5.11-rc4 commit f41a0716f4b08678a73173d71ff3f409b996df2d category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4KIAO CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
-------------------------------------------------
Don't stall the cowblocks scan on a locked inode if we possibly can. We'd much rather the background scanner keep moving.
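The locking pattern, condensed from the diff below (wait is true only for XFS_EOF_FLAGS_SYNC scans):

	if (!xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL))
		return wait ? -EAGAIN : 0;
	if (!xfs_ilock_nowait(ip, XFS_MMAPLOCK_EXCL)) {
		if (wait)
			ret = -EAGAIN;
		goto out_iolock;
	}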
Signed-off-by: Darrick J. Wong djwong@kernel.org Reviewed-by: Christoph Hellwig hch@lst.de Reviewed-by: Brian Foster bfoster@redhat.com Signed-off-by: Lihong Kou koulihong@huawei.com Reviewed-by: Zhang Yi yi.zhang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- fs/xfs/xfs_icache.c | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-)
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index 2b87317120e1..7427cf62d96b 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -1605,17 +1605,31 @@ xfs_inode_free_cowblocks( void *args) { struct xfs_eofblocks *eofb = args; + bool wait; int ret = 0;
+ wait = eofb && (eofb->eof_flags & XFS_EOF_FLAGS_SYNC); + if (!xfs_prep_free_cowblocks(ip)) return 0;
if (!xfs_inode_matches_eofb(ip, eofb)) return 0;
- /* Free the CoW blocks */ - xfs_ilock(ip, XFS_IOLOCK_EXCL); - xfs_ilock(ip, XFS_MMAPLOCK_EXCL); + /* + * If the caller is waiting, return -EAGAIN to keep the background + * scanner moving and revisit the inode in a subsequent pass. + */ + if (!xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) { + if (wait) + return -EAGAIN; + return 0; + } + if (!xfs_ilock_nowait(ip, XFS_MMAPLOCK_EXCL)) { + if (wait) + ret = -EAGAIN; + goto out_iolock; + }
/* * Check again, nobody else should be able to dirty blocks or change @@ -1625,6 +1639,7 @@ xfs_inode_free_cowblocks( ret = xfs_reflink_cancel_cow_range(ip, 0, NULLFILEOFF, false);
xfs_iunlock(ip, XFS_MMAPLOCK_EXCL); +out_iolock: xfs_iunlock(ip, XFS_IOLOCK_EXCL);
return ret;
From: "Darrick J. Wong" djwong@kernel.org
mainline-inclusion from mainline-v5.11-rc4 commit 9a537de3b009d95cfb048b7cbfe9bdb0f655596e category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4KIAO CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
-------------------------------------------------
Buffered writers who have run out of quota reservation call xfs_inode_free_quota_blocks to try to free any space reservations that might reduce the quota usage. Unfortunately, the buffered write path treats "out of project quota" the same as "out of overall space" so this function has never supported scanning for space that might ease an "out of project quota" condition.
We're about to start using this function for cases where we actually /can/ tell if we're out of project quota, so add in this functionality.
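The new project quota branch mirrors the existing user and group cases (from the diff below):

	if (XFS_IS_PQUOTA_ENFORCED(ip->i_mount)) {
		dq = xfs_inode_dquot(ip, XFS_DQTYPE_PROJ);
		if (dq && xfs_dquot_lowsp(dq)) {
			eofb.eof_prid = ip->i_d.di_projid;
			eofb.eof_flags |= XFS_EOF_FLAGS_PRID;
			do_work = true;
		}
	}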
Signed-off-by: Darrick J. Wong djwong@kernel.org Reviewed-by: Christoph Hellwig hch@lst.de Reviewed-by: Brian Foster bfoster@redhat.com Signed-off-by: Lihong Kou koulihong@huawei.com Reviewed-by: Zhang Yi yi.zhang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- fs/xfs/xfs_icache.c | 9 +++++++++ 1 file changed, 9 insertions(+)
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index 7427cf62d96b..9496985e2f09 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -1434,6 +1434,15 @@ xfs_inode_free_quota_blocks( } }
+ if (XFS_IS_PQUOTA_ENFORCED(ip->i_mount)) { + dq = xfs_inode_dquot(ip, XFS_DQTYPE_PROJ); + if (dq && xfs_dquot_lowsp(dq)) { + eofb.eof_prid = ip->i_d.di_projid; + eofb.eof_flags |= XFS_EOF_FLAGS_PRID; + do_work = true; + } + } + if (!do_work) return false;
From: "Darrick J. Wong" djwong@kernel.org
mainline-inclusion from mainline-v5.11-rc4 commit 3d4feec00673d34fbbfe0277d2e0ed1f51d20cb2 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4KIAO CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
-------------------------------------------------
Move this function further down in the file so that later cleanups won't have to declare static functions. Change the name because we're about to rework all the code that performs garbage collection of speculatively allocated file blocks. No functional changes.
Signed-off-by: Darrick J. Wong djwong@kernel.org Reviewed-by: Christoph Hellwig hch@lst.de Reviewed-by: Brian Foster bfoster@redhat.com Signed-off-by: Lihong Kou koulihong@huawei.com Reviewed-by: Zhang Yi yi.zhang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- fs/xfs/xfs_file.c | 2 +- fs/xfs/xfs_icache.c | 110 ++++++++++++++++++++++---------------------- fs/xfs/xfs_icache.h | 2 +- 3 files changed, 57 insertions(+), 57 deletions(-)
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index b3e273c6c093..37c7bcf4c170 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -700,7 +700,7 @@ xfs_file_buffered_aio_write( */ if (ret == -EDQUOT && !cleared_space) { xfs_iunlock(ip, iolock); - cleared_space = xfs_inode_free_quota_blocks(ip); + cleared_space = xfs_blockgc_free_quota(ip); if (cleared_space) goto write_retry; iolock = 0; diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index 9496985e2f09..3641ee508040 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -1396,61 +1396,6 @@ xfs_icache_free_eofblocks( XFS_ICI_EOFBLOCKS_TAG); }
-/* - * Run cow/eofblocks scans on the quotas applicable to the inode. For inodes - * with multiple quotas, we don't know exactly which quota caused an allocation - * failure. We make a best effort by including each quota under low free space - * conditions (less than 1% free space) in the scan. - */ -bool -xfs_inode_free_quota_blocks( - struct xfs_inode *ip) -{ - struct xfs_eofblocks eofb = {0}; - struct xfs_dquot *dq; - bool do_work = false; - - /* - * Run a sync scan to increase effectiveness and use the union filter to - * cover all applicable quotas in a single scan. - */ - eofb.eof_flags = XFS_EOF_FLAGS_UNION | XFS_EOF_FLAGS_SYNC; - - if (XFS_IS_UQUOTA_ENFORCED(ip->i_mount)) { - dq = xfs_inode_dquot(ip, XFS_DQTYPE_USER); - if (dq && xfs_dquot_lowsp(dq)) { - eofb.eof_uid = VFS_I(ip)->i_uid; - eofb.eof_flags |= XFS_EOF_FLAGS_UID; - do_work = true; - } - } - - if (XFS_IS_GQUOTA_ENFORCED(ip->i_mount)) { - dq = xfs_inode_dquot(ip, XFS_DQTYPE_GROUP); - if (dq && xfs_dquot_lowsp(dq)) { - eofb.eof_gid = VFS_I(ip)->i_gid; - eofb.eof_flags |= XFS_EOF_FLAGS_GID; - do_work = true; - } - } - - if (XFS_IS_PQUOTA_ENFORCED(ip->i_mount)) { - dq = xfs_inode_dquot(ip, XFS_DQTYPE_PROJ); - if (dq && xfs_dquot_lowsp(dq)) { - eofb.eof_prid = ip->i_d.di_projid; - eofb.eof_flags |= XFS_EOF_FLAGS_PRID; - do_work = true; - } - } - - if (!do_work) - return false; - - xfs_icache_free_eofblocks(ip->i_mount, &eofb); - xfs_icache_free_cowblocks(ip->i_mount, &eofb); - return true; -} - static inline unsigned long xfs_iflag_for_tag( int tag) @@ -1699,3 +1644,58 @@ xfs_start_block_reaping( xfs_queue_eofblocks(mp); xfs_queue_cowblocks(mp); } + +/* + * Run cow/eofblocks scans on the quotas applicable to the inode. For inodes + * with multiple quotas, we don't know exactly which quota caused an allocation + * failure. We make a best effort by including each quota under low free space + * conditions (less than 1% free space) in the scan. + */ +bool +xfs_blockgc_free_quota( + struct xfs_inode *ip) +{ + struct xfs_eofblocks eofb = {0}; + struct xfs_dquot *dq; + bool do_work = false; + + /* + * Run a sync scan to increase effectiveness and use the union filter to + * cover all applicable quotas in a single scan. + */ + eofb.eof_flags = XFS_EOF_FLAGS_UNION | XFS_EOF_FLAGS_SYNC; + + if (XFS_IS_UQUOTA_ENFORCED(ip->i_mount)) { + dq = xfs_inode_dquot(ip, XFS_DQTYPE_USER); + if (dq && xfs_dquot_lowsp(dq)) { + eofb.eof_uid = VFS_I(ip)->i_uid; + eofb.eof_flags |= XFS_EOF_FLAGS_UID; + do_work = true; + } + } + + if (XFS_IS_GQUOTA_ENFORCED(ip->i_mount)) { + dq = xfs_inode_dquot(ip, XFS_DQTYPE_GROUP); + if (dq && xfs_dquot_lowsp(dq)) { + eofb.eof_gid = VFS_I(ip)->i_gid; + eofb.eof_flags |= XFS_EOF_FLAGS_GID; + do_work = true; + } + } + + if (XFS_IS_PQUOTA_ENFORCED(ip->i_mount)) { + dq = xfs_inode_dquot(ip, XFS_DQTYPE_PROJ); + if (dq && xfs_dquot_lowsp(dq)) { + eofb.eof_prid = ip->i_d.di_projid; + eofb.eof_flags |= XFS_EOF_FLAGS_PRID; + do_work = true; + } + } + + if (!do_work) + return false; + + xfs_icache_free_eofblocks(ip->i_mount, &eofb); + xfs_icache_free_cowblocks(ip->i_mount, &eofb); + return true; +} diff --git a/fs/xfs/xfs_icache.h b/fs/xfs/xfs_icache.h index 3f7ddbca8638..21b726a05b0d 100644 --- a/fs/xfs/xfs_icache.h +++ b/fs/xfs/xfs_icache.h @@ -54,7 +54,7 @@ long xfs_reclaim_inodes_nr(struct xfs_mount *mp, int nr_to_scan);
void xfs_inode_set_reclaim_tag(struct xfs_inode *ip);
-bool xfs_inode_free_quota_blocks(struct xfs_inode *ip); +bool xfs_blockgc_free_quota(struct xfs_inode *ip);
void xfs_inode_set_eofblocks_tag(struct xfs_inode *ip); void xfs_inode_clear_eofblocks_tag(struct xfs_inode *ip);
From: "Darrick J. Wong" djwong@kernel.org
mainline-inclusion from mainline-v5.11-rc4 commit 111068f80eac00173816c2e822c52c316b650df3 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4KIAO CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
-------------------------------------------------
Change the signature of xfs_blockgc_free_quota in preparation for the next few patches. Callers can now pass EOF_FLAGS into the function to control scan parameters, and the function will now pass back any corruption errors seen while scanning, though for our retry loops we'll just try again unconditionally.
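Call sites now pick the sync behavior explicitly; the buffered write path becomes (condensed from the xfs_file.c hunk below):

	if (ret == -EDQUOT && !cleared_space) {
		xfs_iunlock(ip, iolock);
		xfs_blockgc_free_quota(ip, XFS_EOF_FLAGS_SYNC);
		cleared_space = true;
		goto write_retry;
	}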
Signed-off-by: Darrick J. Wong djwong@kernel.org Reviewed-by: Christoph Hellwig hch@lst.de Reviewed-by: Brian Foster bfoster@redhat.com Signed-off-by: Lihong Kou koulihong@huawei.com Reviewed-by: Zhang Yi yi.zhang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- fs/xfs/xfs_file.c | 10 +++++----- fs/xfs/xfs_icache.c | 26 +++++++++++++++++--------- fs/xfs/xfs_icache.h | 2 +- 3 files changed, 23 insertions(+), 15 deletions(-)
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 37c7bcf4c170..e89b03d1110a 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -696,14 +696,14 @@ xfs_file_buffered_aio_write( * metadata space. This reduces the chances that the eofblocks scan * waits on dirty mappings. Since xfs_flush_inodes() is serialized, this * also behaves as a filter to prevent too many eofblocks scans from - * running at the same time. + * running at the same time. Use a synchronous scan to increase the + * effectiveness of the scan. */ if (ret == -EDQUOT && !cleared_space) { xfs_iunlock(ip, iolock); - cleared_space = xfs_blockgc_free_quota(ip); - if (cleared_space) - goto write_retry; - iolock = 0; + xfs_blockgc_free_quota(ip, XFS_EOF_FLAGS_SYNC); + cleared_space = true; + goto write_retry; } else if (ret == -ENOSPC && !cleared_space) { struct xfs_eofblocks eofb = {0};
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index 3641ee508040..47a698551111 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -1650,20 +1650,26 @@ xfs_start_block_reaping( * with multiple quotas, we don't know exactly which quota caused an allocation * failure. We make a best effort by including each quota under low free space * conditions (less than 1% free space) in the scan. + * + * Callers must not hold any inode's ILOCK. If requesting a synchronous scan + * (XFS_EOF_FLAGS_SYNC), the caller also must not hold any inode's IOLOCK or + * MMAPLOCK. */ -bool +int xfs_blockgc_free_quota( - struct xfs_inode *ip) + struct xfs_inode *ip, + unsigned int eof_flags) { struct xfs_eofblocks eofb = {0}; struct xfs_dquot *dq; bool do_work = false; + int error;
/* - * Run a sync scan to increase effectiveness and use the union filter to - * cover all applicable quotas in a single scan. + * Run a scan to free blocks using the union filter to cover all + * applicable quotas in a single scan. */ - eofb.eof_flags = XFS_EOF_FLAGS_UNION | XFS_EOF_FLAGS_SYNC; + eofb.eof_flags = XFS_EOF_FLAGS_UNION | eof_flags;
if (XFS_IS_UQUOTA_ENFORCED(ip->i_mount)) { dq = xfs_inode_dquot(ip, XFS_DQTYPE_USER); @@ -1693,9 +1699,11 @@ xfs_blockgc_free_quota( }
if (!do_work) - return false; + return 0;
- xfs_icache_free_eofblocks(ip->i_mount, &eofb); - xfs_icache_free_cowblocks(ip->i_mount, &eofb); - return true; + error = xfs_icache_free_eofblocks(ip->i_mount, &eofb); + if (error) + return error; + + return xfs_icache_free_cowblocks(ip->i_mount, &eofb); } diff --git a/fs/xfs/xfs_icache.h b/fs/xfs/xfs_icache.h index 21b726a05b0d..d64ea8f5c589 100644 --- a/fs/xfs/xfs_icache.h +++ b/fs/xfs/xfs_icache.h @@ -54,7 +54,7 @@ long xfs_reclaim_inodes_nr(struct xfs_mount *mp, int nr_to_scan);
void xfs_inode_set_reclaim_tag(struct xfs_inode *ip);
-bool xfs_blockgc_free_quota(struct xfs_inode *ip); +int xfs_blockgc_free_quota(struct xfs_inode *ip, unsigned int eof_flags);
void xfs_inode_set_eofblocks_tag(struct xfs_inode *ip); void xfs_inode_clear_eofblocks_tag(struct xfs_inode *ip);
From: "Darrick J. Wong" djwong@kernel.org
mainline-inclusion from mainline-v5.11-rc4 commit 4ca74205685ee3a72ab7fe475f51cc26dea36509 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4KIAO CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
-------------------------------------------------
Now that we've converted xfs_reflink_remap_extent to use the new xfs_trans_alloc_inode API, we can focus on its slightly unusual behavior with regard to quota reservations.
Since it's valid to remap written blocks into a hole, we must be able to increase the quota count by the number of blocks in the mapping. However, the incore space reservation process requires us to supply an asymptotic guess before we can gain exclusive access to resources. We'd like to reserve all the quota we need up front, but we also don't want to fail a written -> allocated remap operation unnecessarily.
The solution is to make the remap_extents function call the transaction allocation function twice. The first time we ask to reserve enough space and quota to handle the absolute worst case situation, but if that fails, we can fall back to the old strategy: ask for the bare minimum space reservation upfront and increase the quota reservation later if we need to.
Later in this patchset we change the transaction and quota code to try to reclaim space if we cannot reserve free space or quota. Restructuring the remap_extent function in this manner means that if the fallback increase fails, we can pass that back to the caller knowing that the transaction allocation already tried freeing space.
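The two-step allocation, condensed from the diff below:

	resblks = XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK);
	/* Worst case: a bmbt split plus quota for the whole mapping. */
	error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_write,
			resblks + dmap->br_blockcount, 0, false, &tp);
	if (error == -EDQUOT || error == -ENOSPC) {
		/* Fall back to the bare minimum; revisit quota later. */
		quota_reserved = false;
		error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_write,
				resblks, 0, false, &tp);
	}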
Signed-off-by: Darrick J. Wong djwong@kernel.org Reviewed-by: Brian Foster bfoster@redhat.com Reviewed-by: Christoph Hellwig hch@lst.de Signed-off-by: Lihong Kou koulihong@huawei.com Reviewed-by: Zhang Yi yi.zhang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- fs/xfs/xfs_reflink.c | 23 ++++++++++++++++++++--- 1 file changed, 20 insertions(+), 3 deletions(-)
diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index 20e7b1706718..0fec495d0727 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -986,6 +986,7 @@ xfs_reflink_remap_extent( xfs_off_t newlen; int64_t qdelta = 0; unsigned int resblks; + bool quota_reserved = true; bool smap_real; bool dmap_written = xfs_bmap_is_written_extent(dmap); int nimaps; @@ -1000,10 +1001,26 @@ xfs_reflink_remap_extent( * the same index in the bmap btree, so we only need a reservation for * one bmbt split if either thing is happening. However, we haven't * locked the inode yet, so we reserve assuming this is the case. + * + * The first allocation call tries to reserve enough space to handle + * mapping dmap into a sparse part of the file plus the bmbt split. We + * haven't locked the inode or read the existing mapping yet, so we do + * not know for sure that we need the space. This should succeed most + * of the time. + * + * If the first attempt fails, try again but reserving only enough + * space to handle a bmbt split. This is the hard minimum requirement, + * and we revisit quota reservations later when we know more about what + * we're remapping. */ resblks = XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK); - error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_write, resblks, 0, - false, &tp); + error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_write, + resblks + dmap->br_blockcount, 0, false, &tp); + if (error == -EDQUOT || error == -ENOSPC) { + quota_reserved = false; + error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_write, + resblks, 0, false, &tp); + } if (error) goto out;
@@ -1070,7 +1087,7 @@ xfs_reflink_remap_extent( * before we started. That should have removed all the delalloc * reservations, but we code defensively. */ - if (!smap_real && dmap_written) { + if (!quota_reserved && !smap_real && dmap_written) { error = xfs_trans_reserve_quota_nblks(tp, ip, dmap->br_blockcount, 0, false); if (error)
From: "Darrick J. Wong" djwong@kernel.org
mainline-inclusion from mainline-v5.11-rc4 commit 766aabd59929cd05fc1a249f376e4395bed93d30 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4KIAO CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
-------------------------------------------------
If a fs modification (data write, reflink, xattr set, fallocate, etc.) is unable to reserve enough quota to handle the modification, try clearing whatever space the filesystem might have been hanging onto in the hopes of speeding up the filesystem. The flushing behavior will become particularly important when we add deferred inode inactivation because that will increase the amount of space that isn't actively tied to user data.
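Inside xfs_trans_alloc_inode this takes the shape of a one-shot retry (condensed from the diff below):

	bool		retried = false;

retry:
	/* ... xfs_trans_alloc() and dquot attach as before ... */

	error = xfs_trans_reserve_quota_nblks(tp, ip, dblocks, rblocks, force);
	if ((error == -EDQUOT || error == -ENOSPC) && !retried) {
		xfs_trans_cancel(tp);
		xfs_iunlock(ip, XFS_ILOCK_EXCL);
		xfs_blockgc_free_quota(ip, 0);
		retried = true;
		goto retry;
	}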
Signed-off-by: Darrick J. Wong djwong@kernel.org Reviewed-by: Christoph Hellwig hch@lst.de Reviewed-by: Brian Foster bfoster@redhat.com Signed-off-by: Lihong Kou koulihong@huawei.com Reviewed-by: Zhang Yi yi.zhang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- fs/xfs/xfs_reflink.c | 5 +++++ fs/xfs/xfs_trans.c | 10 ++++++++++ 2 files changed, 15 insertions(+)
diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index 0fec495d0727..5e3f00f1192a 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -1086,6 +1086,11 @@ xfs_reflink_remap_extent( * count. This is suboptimal, but the VFS flushed the dest range * before we started. That should have removed all the delalloc * reservations, but we code defensively. + * + * xfs_trans_alloc_inode above already tried to grab an even larger + * quota reservation, and kicked off a blockgc scan if it couldn't. + * If we can't get a potentially smaller quota reservation now, we're + * done. */ if (!quota_reserved && !smap_real && dmap_written) { error = xfs_trans_reserve_quota_nblks(tp, ip, diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index 788a7d0fafaa..0f7152a9bfa4 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -23,6 +23,7 @@ #include "xfs_inode.h" #include "xfs_dquot_item.h" #include "xfs_dquot.h" +#include "xfs_icache.h"
kmem_zone_t *xfs_trans_zone;
@@ -1042,8 +1043,10 @@ xfs_trans_alloc_inode( { struct xfs_trans *tp; struct xfs_mount *mp = ip->i_mount; + bool retried = false; int error;
+retry: error = xfs_trans_alloc(mp, resv, dblocks, rblocks / mp->m_sb.sb_rextsize, force ? XFS_TRANS_RESERVE : 0, &tp); @@ -1061,6 +1064,13 @@ xfs_trans_alloc_inode( }
error = xfs_trans_reserve_quota_nblks(tp, ip, dblocks, rblocks, force); + if ((error == -EDQUOT || error == -ENOSPC) && !retried) { + xfs_trans_cancel(tp); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + xfs_blockgc_free_quota(ip, 0); + retried = true; + goto retry; + } if (error) goto out_cancel;
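The heart of this change is a retry-once idiom: if the quota reservation fails with -EDQUOT or -ENOSPC, tear the transaction down, run a blockgc scan to reclaim speculative preallocations, and try exactly one more time. The sketch below is a self-contained userspace model of that control flow, not kernel code: reserve_quota() and blockgc_scan() are hypothetical stand-ins for xfs_trans_reserve_quota_nblks() and xfs_blockgc_free_quota(), and the quota conveniently freeing up after a single scan is contrived for the demonstration.

#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

static bool space_freed;

/* Stand-in for xfs_trans_reserve_quota_nblks(): fails until a scan runs. */
static int reserve_quota(long blocks)
{
	(void)blocks;
	return space_freed ? 0 : -EDQUOT;
}

/* Stand-in for xfs_blockgc_free_quota(): pretend the scan freed space. */
static void blockgc_scan(void)
{
	space_freed = true;
}

static int alloc_trans_with_quota(long blocks)
{
	bool retried = false;
	int error;

retry:
	error = reserve_quota(blocks);
	if ((error == -EDQUOT || error == -ENOSPC) && !retried) {
		/* The real code cancels the transaction and drops the
		 * ILOCK here before scanning, then reacquires both. */
		blockgc_scan();
		retried = true;
		goto retry;
	}
	return error;
}

int main(void)
{
	printf("reservation result: %d\n", alloc_trans_with_quota(128));
	return 0;
}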
From: "Darrick J. Wong" djwong@kernel.org
mainline-inclusion from mainline-v5.11-rc4 commit c237dd7c709432611a7642ca10c2a0c8c48ea313 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4KIAO CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
-------------------------------------------------
If an inode creation is unable to reserve enough quota to handle the modification, try clearing whatever space the filesystem might have been hanging onto in the hopes of speeding up the filesystem. The flushing behavior will become particularly important when we add deferred inode inactivation because that will increase the amount of space that isn't actively tied to user data.
Signed-off-by: Darrick J. Wong djwong@kernel.org Reviewed-by: Christoph Hellwig hch@lst.de Reviewed-by: Brian Foster bfoster@redhat.com Signed-off-by: Lihong Kou koulihong@huawei.com Reviewed-by: Zhang Yi yi.zhang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- fs/xfs/xfs_icache.c | 68 +++++++++++++++++++++++++-------------------- fs/xfs/xfs_icache.h | 3 ++ fs/xfs/xfs_trans.c | 8 ++++++ 3 files changed, 49 insertions(+), 30 deletions(-)
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index 47a698551111..4bca8c4d3356 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -1646,64 +1646,72 @@ xfs_start_block_reaping( }
/* - * Run cow/eofblocks scans on the quotas applicable to the inode. For inodes - * with multiple quotas, we don't know exactly which quota caused an allocation - * failure. We make a best effort by including each quota under low free space - * conditions (less than 1% free space) in the scan. + * Run cow/eofblocks scans on the supplied dquots. We don't know exactly which + * quota caused an allocation failure, so we make a best effort by including + * each quota under low free space conditions (less than 1% free space) in the + * scan. * * Callers must not hold any inode's ILOCK. If requesting a synchronous scan * (XFS_EOF_FLAGS_SYNC), the caller also must not hold any inode's IOLOCK or * MMAPLOCK. */ int -xfs_blockgc_free_quota( - struct xfs_inode *ip, +xfs_blockgc_free_dquots( + struct xfs_mount *mp, + struct xfs_dquot *udqp, + struct xfs_dquot *gdqp, + struct xfs_dquot *pdqp, unsigned int eof_flags) { struct xfs_eofblocks eofb = {0}; - struct xfs_dquot *dq; bool do_work = false; int error;
+ if (!udqp && !gdqp && !pdqp) + return 0; + /* * Run a scan to free blocks using the union filter to cover all * applicable quotas in a single scan. */ eofb.eof_flags = XFS_EOF_FLAGS_UNION | eof_flags;
- if (XFS_IS_UQUOTA_ENFORCED(ip->i_mount)) { - dq = xfs_inode_dquot(ip, XFS_DQTYPE_USER); - if (dq && xfs_dquot_lowsp(dq)) { - eofb.eof_uid = VFS_I(ip)->i_uid; - eofb.eof_flags |= XFS_EOF_FLAGS_UID; - do_work = true; - } + if (XFS_IS_UQUOTA_ENFORCED(mp) && udqp && xfs_dquot_lowsp(udqp)) { + eofb.eof_uid = make_kuid(mp->m_super->s_user_ns, udqp->q_id); + eofb.eof_flags |= XFS_EOF_FLAGS_UID; + do_work = true; }
- if (XFS_IS_GQUOTA_ENFORCED(ip->i_mount)) { - dq = xfs_inode_dquot(ip, XFS_DQTYPE_GROUP); - if (dq && xfs_dquot_lowsp(dq)) { - eofb.eof_gid = VFS_I(ip)->i_gid; - eofb.eof_flags |= XFS_EOF_FLAGS_GID; - do_work = true; - } + if (XFS_IS_UQUOTA_ENFORCED(mp) && gdqp && xfs_dquot_lowsp(gdqp)) { + eofb.eof_gid = make_kgid(mp->m_super->s_user_ns, gdqp->q_id); + eofb.eof_flags |= XFS_EOF_FLAGS_GID; + do_work = true; }
- if (XFS_IS_PQUOTA_ENFORCED(ip->i_mount)) { - dq = xfs_inode_dquot(ip, XFS_DQTYPE_PROJ); - if (dq && xfs_dquot_lowsp(dq)) { - eofb.eof_prid = ip->i_d.di_projid; - eofb.eof_flags |= XFS_EOF_FLAGS_PRID; - do_work = true; - } + if (XFS_IS_PQUOTA_ENFORCED(mp) && pdqp && xfs_dquot_lowsp(pdqp)) { + eofb.eof_prid = pdqp->q_id; + eofb.eof_flags |= XFS_EOF_FLAGS_PRID; + do_work = true; }
if (!do_work) return 0;
- error = xfs_icache_free_eofblocks(ip->i_mount, &eofb); + error = xfs_icache_free_eofblocks(mp, &eofb); if (error) return error;
- return xfs_icache_free_cowblocks(ip->i_mount, &eofb); + return xfs_icache_free_cowblocks(mp, &eofb); +} + +/* Run cow/eofblocks scans on the quotas attached to the inode. */ +int +xfs_blockgc_free_quota( + struct xfs_inode *ip, + unsigned int eof_flags) +{ + return xfs_blockgc_free_dquots(ip->i_mount, + xfs_inode_dquot(ip, XFS_DQTYPE_USER), + xfs_inode_dquot(ip, XFS_DQTYPE_GROUP), + xfs_inode_dquot(ip, XFS_DQTYPE_PROJ), eof_flags); } diff --git a/fs/xfs/xfs_icache.h b/fs/xfs/xfs_icache.h index d64ea8f5c589..5f7d7c192d1e 100644 --- a/fs/xfs/xfs_icache.h +++ b/fs/xfs/xfs_icache.h @@ -54,6 +54,9 @@ long xfs_reclaim_inodes_nr(struct xfs_mount *mp, int nr_to_scan);
void xfs_inode_set_reclaim_tag(struct xfs_inode *ip);
+int xfs_blockgc_free_dquots(struct xfs_mount *mp, struct xfs_dquot *udqp, + struct xfs_dquot *gdqp, struct xfs_dquot *pdqp, + unsigned int eof_flags); int xfs_blockgc_free_quota(struct xfs_inode *ip, unsigned int eof_flags);
void xfs_inode_set_eofblocks_tag(struct xfs_inode *ip); diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index 0f7152a9bfa4..83f8222cb951 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -1098,13 +1098,21 @@ xfs_trans_alloc_icreate( struct xfs_trans **tpp) { struct xfs_trans *tp; + bool retried = false; int error;
+retry: error = xfs_trans_alloc(mp, resv, dblocks, 0, 0, &tp); if (error) return error;
error = xfs_trans_reserve_quota_icreate(tp, udqp, gdqp, pdqp, dblocks); + if ((error == -EDQUOT || error == -ENOSPC) && !retried) { + xfs_trans_cancel(tp); + xfs_blockgc_free_dquots(mp, udqp, gdqp, pdqp, 0); + retried = true; + goto retry; + } if (error) { xfs_trans_cancel(tp); return error;
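The inode-creation path applies the same retry-once shape as the sketch after the previous patch, with two differences visible in the diff: no inode exists yet, so there is no ILOCK to drop and only xfs_trans_cancel() is needed before the scan, and the scan is driven by the new xfs_blockgc_free_dquots() on the caller-supplied dquots rather than by the dquots attached to an inode.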
From: "Darrick J. Wong" djwong@kernel.org
mainline-inclusion from mainline-v5.11-rc4 commit 758303d1449965819661048e9e31f32d61888f70 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4KIAO CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
-------------------------------------------------
If a file user, group, or project change is unable to reserve enough quota to handle the modification, try clearing whatever space the filesystem might have been hanging onto in the hopes of speeding up the filesystem. The flushing behavior will become particularly important when we add deferred inode inactivation because that will increase the amount of space that isn't actively tied to user data.
Signed-off-by: Darrick J. Wong djwong@kernel.org Reviewed-by: Christoph Hellwig hch@lst.de Reviewed-by: Brian Foster bfoster@redhat.com Signed-off-by: Lihong Kou koulihong@huawei.com Reviewed-by: Zhang Yi yi.zhang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- fs/xfs/xfs_trans.c | 29 +++++++++++++++++++---------- 1 file changed, 19 insertions(+), 10 deletions(-)
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index 83f8222cb951..78bacd133288 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -1134,16 +1134,21 @@ xfs_trans_alloc_icreate( int xfs_trans_alloc_ichange( struct xfs_inode *ip, - struct xfs_dquot *udqp, - struct xfs_dquot *gdqp, - struct xfs_dquot *pdqp, + struct xfs_dquot *new_udqp, + struct xfs_dquot *new_gdqp, + struct xfs_dquot *new_pdqp, bool force, struct xfs_trans **tpp) { struct xfs_trans *tp; struct xfs_mount *mp = ip->i_mount; + struct xfs_dquot *udqp; + struct xfs_dquot *gdqp; + struct xfs_dquot *pdqp; + bool retried = false; int error;
+retry: error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ichange, 0, 0, 0, &tp); if (error) return error; @@ -1161,14 +1166,12 @@ xfs_trans_alloc_ichange( /* * For each quota type, skip quota reservations if the inode's dquots * now match the ones that came from the caller, or the caller didn't - * pass one in. + * pass one in. The inode's dquots can change if we drop the ILOCK to + * perform a blockgc scan, so we must preserve the caller's arguments. */ - if (udqp == ip->i_udquot) - udqp = NULL; - if (gdqp == ip->i_gdquot) - gdqp = NULL; - if (pdqp == ip->i_pdquot) - pdqp = NULL; + udqp = (new_udqp != ip->i_udquot) ? new_udqp : NULL; + gdqp = (new_gdqp != ip->i_gdquot) ? new_gdqp : NULL; + pdqp = (new_pdqp != ip->i_pdquot) ? new_pdqp : NULL; if (udqp || gdqp || pdqp) { unsigned int qflags = XFS_QMOPT_RES_REGBLKS;
@@ -1184,6 +1187,12 @@ xfs_trans_alloc_ichange( error = xfs_trans_reserve_quota_bydquots(tp, mp, udqp, gdqp, pdqp, ip->i_d.di_nblocks + ip->i_delayed_blks, 1, qflags); + if ((error == -EDQUOT || error == -ENOSPC) && !retried) { + xfs_trans_cancel(tp); + xfs_blockgc_free_dquots(mp, udqp, gdqp, pdqp, 0); + retried = true; + goto retry; + } if (error) goto out_cancel; }
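The subtle part of this patch is the new comment about preserving the caller's arguments: the blockgc scan runs with the ILOCK dropped, so the dquots attached to the inode can change between the first and second reservation attempts. Below is a self-contained userspace model (illustrative names and behavior, not kernel code) of why the reservation targets must be re-derived from the caller's arguments on every pass instead of being NULLed out in place, as the old code shape did.

#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

struct dquot { int id; };

static struct dquot user_a = { 1 }, user_b = { 2 }, group_g = { 3 };

/* dquots currently attached to the inode; the ILOCK is dropped around
 * the blockgc scan, so these can change between retry passes. */
static struct dquot *ip_udquot = &user_a;
static struct dquot *ip_gdquot;

static bool group_has_space;	/* group quota frees up after the scan */

static int reserve(struct dquot *udqp, struct dquot *gdqp)
{
	if (gdqp && !group_has_space)
		return -EDQUOT;
	if (udqp)
		printf("charged user dquot %d\n", udqp->id);
	if (gdqp)
		printf("charged group dquot %d\n", gdqp->id);
	return 0;
}

int main(void)
{
	struct dquot *new_udqp = &user_a;	/* matches the inode: skip */
	struct dquot *new_gdqp = &group_g;	/* differs: must be charged */
	struct dquot *udqp, *gdqp;
	bool retried = false;
	int error;

retry:
	/* Derive locals each pass instead of clobbering the arguments. */
	udqp = (new_udqp != ip_udquot) ? new_udqp : NULL;
	gdqp = (new_gdqp != ip_gdquot) ? new_gdqp : NULL;

	error = reserve(udqp, gdqp);
	if (error == -EDQUOT && !retried) {
		/* The scan ran unlocked; the inode's user dquot changed. */
		ip_udquot = &user_b;
		group_has_space = true;
		retried = true;
		goto retry;
	}
	/* Pass two now correctly charges user dquot 1 as well; code that
	 * had NULLed new_udqp on pass one would silently skip it. */
	return error ? 1 : 0;
}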
From: "Darrick J. Wong" djwong@kernel.org
mainline-inclusion from mainline-v5.11-rc4 commit 38899f8099945559662e6a6e355b9059088e3b34 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4KIAO CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
-------------------------------------------------
Add some tracepoints so that we can observe when the speculative preallocation garbage collector runs.
Signed-off-by: Darrick J. Wong djwong@kernel.org Reviewed-by: Christoph Hellwig hch@lst.de Reviewed-by: Brian Foster bfoster@redhat.com Signed-off-by: Lihong Kou koulihong@huawei.com Reviewed-by: Zhang Yi yi.zhang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- fs/xfs/xfs_ioctl.c | 2 ++ fs/xfs/xfs_trace.c | 1 + fs/xfs/xfs_trace.h | 41 +++++++++++++++++++++++++++++++++++++++++ 3 files changed, 44 insertions(+)
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index cde305a15c84..855aefbc1d83 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -2339,6 +2339,8 @@ xfs_file_ioctl( if (error) return error;
+ trace_xfs_ioc_free_eofblocks(mp, &keofb, _RET_IP_); + sb_start_write(mp->m_super); error = xfs_icache_free_eofblocks(mp, &keofb); sb_end_write(mp->m_super); diff --git a/fs/xfs/xfs_trace.c b/fs/xfs/xfs_trace.c index 120398a37c2a..9b8d703dc9fd 100644 --- a/fs/xfs/xfs_trace.c +++ b/fs/xfs/xfs_trace.c @@ -29,6 +29,7 @@ #include "xfs_filestream.h" #include "xfs_fsmap.h" #include "xfs_btree_staging.h" +#include "xfs_icache.h"
/* * We include this last to have the helpers above available for the trace diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 54a7b3c63d7b..454f78dd4fb3 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -37,6 +37,7 @@ struct xfs_trans_res; struct xfs_inobt_rec_incore; union xfs_btree_ptr; struct xfs_dqtrx; +struct xfs_eofblocks;
#define XFS_ATTR_FILTER_FLAGS \ { XFS_ATTR_ROOT, "ROOT" }, \ @@ -3868,6 +3869,46 @@ DEFINE_EVENT(xfs_timestamp_range_class, name, \ DEFINE_TIMESTAMP_RANGE_EVENT(xfs_inode_timestamp_range); DEFINE_TIMESTAMP_RANGE_EVENT(xfs_quota_expiry_range);
+DECLARE_EVENT_CLASS(xfs_eofblocks_class, + TP_PROTO(struct xfs_mount *mp, struct xfs_eofblocks *eofb, + unsigned long caller_ip), + TP_ARGS(mp, eofb, caller_ip), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(__u32, flags) + __field(uint32_t, uid) + __field(uint32_t, gid) + __field(prid_t, prid) + __field(__u64, min_file_size) + __field(unsigned long, caller_ip) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->flags = eofb ? eofb->eof_flags : 0; + __entry->uid = eofb ? from_kuid(mp->m_super->s_user_ns, + eofb->eof_uid) : 0; + __entry->gid = eofb ? from_kgid(mp->m_super->s_user_ns, + eofb->eof_gid) : 0; + __entry->prid = eofb ? eofb->eof_prid : 0; + __entry->min_file_size = eofb ? eofb->eof_min_file_size : 0; + __entry->caller_ip = caller_ip; + ), + TP_printk("dev %d:%d flags 0x%x uid %u gid %u prid %u minsize %llu caller %pS", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->flags, + __entry->uid, + __entry->gid, + __entry->prid, + __entry->min_file_size, + (char *)__entry->caller_ip) +); +#define DEFINE_EOFBLOCKS_EVENT(name) \ +DEFINE_EVENT(xfs_eofblocks_class, name, \ + TP_PROTO(struct xfs_mount *mp, struct xfs_eofblocks *eofb, \ + unsigned long caller_ip), \ + TP_ARGS(mp, eofb, caller_ip)) +DEFINE_EOFBLOCKS_EVENT(xfs_ioc_free_eofblocks); + #endif /* _TRACE_XFS_H */
#undef TRACE_INCLUDE_PATH
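Once this lands, the new event can be exercised from userspace with no extra tooling: with tracefs mounted in its usual location, writing 1 to /sys/kernel/tracing/events/xfs/xfs_ioc_free_eofblocks/enable and then issuing the XFS_IOC_FREE_EOFBLOCKS ioctl should emit one line per call in /sys/kernel/tracing/trace_pipe, carrying the dev, flags, uid/gid/prid filter, minimum file size, and caller address recorded by the event class above. (The path assumes the common tracefs mount point; older setups use /sys/kernel/debug/tracing.)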
From: "Darrick J. Wong" djwong@kernel.org
mainline-inclusion from mainline-v5.11-rc4 commit 85c5b27075ba0389855d9f46ff1b1d5c34a44c94 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4KIAO CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
-------------------------------------------------
In anticipation of more restructuring of the eof/cowblocks gc code, refactor calling of those two functions into a single internal helper function, then present a new standard interface to purge speculative block preallocations and start shifting higher level code to use that.
Signed-off-by: Darrick J. Wong djwong@kernel.org Reviewed-by: Christoph Hellwig hch@lst.de Reviewed-by: Brian Foster bfoster@redhat.com Signed-off-by: Lihong Kou koulihong@huawei.com Reviewed-by: Zhang Yi yi.zhang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- fs/xfs/xfs_file.c | 3 +-- fs/xfs/xfs_icache.c | 39 +++++++++++++++++++++++++++++++++------ fs/xfs/xfs_icache.h | 1 + fs/xfs/xfs_trace.h | 1 + 4 files changed, 36 insertions(+), 8 deletions(-)
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index e89b03d1110a..139df7fde2e6 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -712,8 +712,7 @@ xfs_file_buffered_aio_write(
xfs_iunlock(ip, iolock); eofb.eof_flags = XFS_EOF_FLAGS_SYNC; - xfs_icache_free_eofblocks(ip->i_mount, &eofb); - xfs_icache_free_cowblocks(ip->i_mount, &eofb); + xfs_blockgc_free_space(ip->i_mount, &eofb); goto write_retry; }
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index 4bca8c4d3356..e63eba69183c 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -1645,6 +1645,38 @@ xfs_start_block_reaping( xfs_queue_cowblocks(mp); }
+/* Scan all incore inodes for block preallocations that we can remove. */ +static inline int +xfs_blockgc_scan( + struct xfs_mount *mp, + struct xfs_eofblocks *eofb) +{ + int error; + + error = xfs_icache_free_eofblocks(mp, eofb); + if (error) + return error; + + error = xfs_icache_free_cowblocks(mp, eofb); + if (error) + return error; + + return 0; +} + +/* + * Try to free space in the filesystem by purging eofblocks and cowblocks. + */ +int +xfs_blockgc_free_space( + struct xfs_mount *mp, + struct xfs_eofblocks *eofb) +{ + trace_xfs_blockgc_free_space(mp, eofb, _RET_IP_); + + return xfs_blockgc_scan(mp, eofb); +} + /* * Run cow/eofblocks scans on the supplied dquots. We don't know exactly which * quota caused an allocation failure, so we make a best effort by including @@ -1665,7 +1697,6 @@ xfs_blockgc_free_dquots( { struct xfs_eofblocks eofb = {0}; bool do_work = false; - int error;
if (!udqp && !gdqp && !pdqp) return 0; @@ -1697,11 +1728,7 @@ xfs_blockgc_free_dquots( if (!do_work) return 0;
- error = xfs_icache_free_eofblocks(mp, &eofb); - if (error) - return error; - - return xfs_icache_free_cowblocks(mp, &eofb); + return xfs_blockgc_free_space(mp, &eofb); }
/* Run cow/eofblocks scans on the quotas attached to the inode. */ diff --git a/fs/xfs/xfs_icache.h b/fs/xfs/xfs_icache.h index 5f7d7c192d1e..f7dc8d1c91e5 100644 --- a/fs/xfs/xfs_icache.h +++ b/fs/xfs/xfs_icache.h @@ -58,6 +58,7 @@ int xfs_blockgc_free_dquots(struct xfs_mount *mp, struct xfs_dquot *udqp, struct xfs_dquot *gdqp, struct xfs_dquot *pdqp, unsigned int eof_flags); int xfs_blockgc_free_quota(struct xfs_inode *ip, unsigned int eof_flags); +int xfs_blockgc_free_space(struct xfs_mount *mp, struct xfs_eofblocks *eofb);
void xfs_inode_set_eofblocks_tag(struct xfs_inode *ip); void xfs_inode_clear_eofblocks_tag(struct xfs_inode *ip); diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 454f78dd4fb3..4a0f4354992c 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -3908,6 +3908,7 @@ DEFINE_EVENT(xfs_eofblocks_class, name, \ unsigned long caller_ip), \ TP_ARGS(mp, eofb, caller_ip)) DEFINE_EOFBLOCKS_EVENT(xfs_ioc_free_eofblocks); +DEFINE_EOFBLOCKS_EVENT(xfs_blockgc_free_space);
#endif /* _TRACE_XFS_H */
From: "Darrick J. Wong" djwong@kernel.org
mainline-inclusion from mainline-v5.11-rc4 commit a1a7d05a05765eec042942a5c360e909c0dd0131 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4KIAO CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
-------------------------------------------------
If a fs modification (creation, file write, reflink, etc.) is unable to reserve enough space to handle the modification, try clearing whatever space the filesystem might have been hanging onto in the hopes of speeding up the filesystem. The flushing behavior will become particularly important when we add deferred inode inactivation because that will increase the amount of space that isn't actively tied to user data.
Signed-off-by: Darrick J. Wong djwong@kernel.org Reviewed-by: Christoph Hellwig hch@lst.de Reviewed-by: Brian Foster bfoster@redhat.com Signed-off-by: Lihong Kou koulihong@huawei.com Reviewed-by: Zhang Yi yi.zhang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- fs/xfs/xfs_trans.c | 11 +++++++++++ 1 file changed, 11 insertions(+)
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index 78bacd133288..3a628c2ea8ac 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -289,6 +289,17 @@ xfs_trans_alloc( tp->t_firstblock = NULLFSBLOCK;
error = xfs_trans_reserve(tp, resp, blocks, rtextents); + if (error == -ENOSPC) { + /* + * We weren't able to reserve enough space for the transaction. + * Flush the other speculative space allocations to free space. + * Do not perform a synchronous scan because callers can hold + * other locks. + */ + error = xfs_blockgc_free_space(mp, NULL); + if (!error) + error = xfs_trans_reserve(tp, resp, blocks, rtextents); + } if (error) { xfs_trans_cancel(tp); return error;
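Two details distinguish this retry from the quota-driven ones earlier in the series: the scan filter passed to xfs_blockgc_free_space() is NULL, so every tagged inode in the filesystem is eligible, and the scan is deliberately asynchronous because xfs_trans_alloc() callers may already hold IOLOCKs or MMAPLOCKs that a synchronous scan would need to take. After this patch, any transaction reservation can trigger one blockgc pass before ENOSPC is finally returned to the caller.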
From: "Darrick J. Wong" djwong@kernel.org
mainline-inclusion from mainline-v5.11-rc4 commit f83d436aef5def77b318effc14809fdc57092588 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4KIAO CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
-------------------------------------------------
Increase the parallelism level for pwork clients to the workqueue defaults so that we can take advantage of computers with a lot of CPUs and a lot of hardware. On fast systems this will speed up quotacheck by a large factor, and the following posteof/cowblocks cleanup series will use the functionality presented in this patch to run garbage collection as quickly as possible.
We do this by switching the pwork workqueue to unbounded, since the current user (quotacheck) runs lengthy scans for each work item and we don't care about dispatching the work on a warm cpu cache or anything like that. Also set WQ_SYSFS so that we can monitor where the wq is running.
Signed-off-by: Darrick J. Wong djwong@kernel.org Reviewed-by: Christoph Hellwig hch@lst.de Reviewed-by: Brian Foster bfoster@redhat.com Signed-off-by: Lihong Kou koulihong@huawei.com Reviewed-by: Zhang Yi yi.zhang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- Documentation/admin-guide/xfs.rst | 38 +++++++++++++++++++++++++++++++ fs/xfs/xfs_iwalk.c | 5 +--- fs/xfs/xfs_pwork.c | 25 ++++---------------- fs/xfs/xfs_pwork.h | 4 +--- 4 files changed, 45 insertions(+), 27 deletions(-)
diff --git a/Documentation/admin-guide/xfs.rst b/Documentation/admin-guide/xfs.rst index 86de8a1ad91c..b00b1eece9de 100644 --- a/Documentation/admin-guide/xfs.rst +++ b/Documentation/admin-guide/xfs.rst @@ -495,3 +495,41 @@ the class and error context. For example, the default values for "metadata/ENODEV" are "0" rather than "-1" so that this error handler defaults to "fail immediately" behaviour. This is done because ENODEV is a fatal, unrecoverable error no matter how many times the metadata IO is retried. + +Workqueue Concurrency +===================== + +XFS uses kernel workqueues to parallelize metadata update processes. This +enables it to take advantage of storage hardware that can service many IO +operations simultaneously. This interface exposes internal implementation +details of XFS, and as such is explicitly not part of any userspace API/ABI +guarantee the kernel may give userspace. These are undocumented features of +the generic workqueue implementation XFS uses for concurrency, and they are +provided here purely for diagnostic and tuning purposes and may change at any +time in the future. + +The control knobs for a filesystem's workqueues are organized by task at hand +and the short name of the data device. They all can be found in: + + /sys/bus/workqueue/devices/${task}!${device} + +================ =========== + Task Description +================ =========== + xfs_iwalk-$pid Inode scans of the entire filesystem. Currently limited to + mount time quotacheck. +================ =========== + +For example, the knobs for the quotacheck workqueue for /dev/nvme0n1 would be +found in /sys/bus/workqueue/devices/xfs_iwalk-1111!nvme0n1/. + +The interesting knobs for XFS workqueues are as follows: + +============ =========== + Knob Description +============ =========== + max_active Maximum number of background threads that can be started to + run the work. + cpumask CPUs upon which the threads are allowed to run. + nice Relative priority of scheduling the threads. These are the + same nice levels that can be applied to userspace processes. diff --git a/fs/xfs/xfs_iwalk.c b/fs/xfs/xfs_iwalk.c index 2a45138831e3..708874851dbd 100644 --- a/fs/xfs/xfs_iwalk.c +++ b/fs/xfs/xfs_iwalk.c @@ -618,15 +618,12 @@ xfs_iwalk_threaded( { struct xfs_pwork_ctl pctl; xfs_agnumber_t agno = XFS_INO_TO_AGNO(mp, startino); - unsigned int nr_threads; int error;
ASSERT(agno < mp->m_sb.sb_agcount); ASSERT(!(flags & ~XFS_IWALK_FLAGS_ALL));
- nr_threads = xfs_pwork_guess_datadev_parallelism(mp); - error = xfs_pwork_init(mp, &pctl, xfs_iwalk_ag_work, "xfs_iwalk", - nr_threads); + error = xfs_pwork_init(mp, &pctl, xfs_iwalk_ag_work, "xfs_iwalk"); if (error) return error;
diff --git a/fs/xfs/xfs_pwork.c b/fs/xfs/xfs_pwork.c index b03333f1c84a..c283b801cc5d 100644 --- a/fs/xfs/xfs_pwork.c +++ b/fs/xfs/xfs_pwork.c @@ -61,16 +61,18 @@ xfs_pwork_init( struct xfs_mount *mp, struct xfs_pwork_ctl *pctl, xfs_pwork_work_fn work_fn, - const char *tag, - unsigned int nr_threads) + const char *tag) { + unsigned int nr_threads = 0; + #ifdef DEBUG if (xfs_globals.pwork_threads >= 0) nr_threads = xfs_globals.pwork_threads; #endif trace_xfs_pwork_init(mp, nr_threads, current->pid);
- pctl->wq = alloc_workqueue("%s-%d", WQ_FREEZABLE, nr_threads, tag, + pctl->wq = alloc_workqueue("%s-%d", + WQ_UNBOUND | WQ_SYSFS | WQ_FREEZABLE, nr_threads, tag, current->pid); if (!pctl->wq) return -ENOMEM; @@ -117,20 +119,3 @@ xfs_pwork_poll( atomic_read(&pctl->nr_work) == 0, HZ) == 0) touch_softlockup_watchdog(); } - -/* - * Return the amount of parallelism that the data device can handle, or 0 for - * no limit. - */ -unsigned int -xfs_pwork_guess_datadev_parallelism( - struct xfs_mount *mp) -{ - struct xfs_buftarg *btp = mp->m_ddev_targp; - - /* - * For now we'll go with the most conservative setting possible, - * which is two threads for an SSD and 1 thread everywhere else. - */ - return blk_queue_nonrot(btp->bt_bdev->bd_disk->queue) ? 2 : 1; -} diff --git a/fs/xfs/xfs_pwork.h b/fs/xfs/xfs_pwork.h index 8133124cf3bb..c0ef81fc85dd 100644 --- a/fs/xfs/xfs_pwork.h +++ b/fs/xfs/xfs_pwork.h @@ -51,11 +51,9 @@ xfs_pwork_want_abort( }
int xfs_pwork_init(struct xfs_mount *mp, struct xfs_pwork_ctl *pctl, - xfs_pwork_work_fn work_fn, const char *tag, - unsigned int nr_threads); + xfs_pwork_work_fn work_fn, const char *tag); void xfs_pwork_queue(struct xfs_pwork_ctl *pctl, struct xfs_pwork *pwork); int xfs_pwork_destroy(struct xfs_pwork_ctl *pctl); void xfs_pwork_poll(struct xfs_pwork_ctl *pctl); -unsigned int xfs_pwork_guess_datadev_parallelism(struct xfs_mount *mp);
#endif /* __XFS_PWORK_H__ */
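As a concrete (hypothetical) use of the knobs documented above: if quotacheck on /dev/nvme0n1 ran as pid 1111, writing 8 to /sys/bus/workqueue/devices/xfs_iwalk-1111!nvme0n1/max_active would cap the inode-scan concurrency at eight workers, while cpumask and nice in the same directory restrict and reprioritize them. The pid and device name here follow the example in the documentation hunk; the real values vary per mount.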
From: "Darrick J. Wong" djwong@kernel.org
mainline-inclusion from mainline-v5.11-rc4 commit 05a302a17062ca73dc91b508cf2a0b25724db15d category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4KIAO CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
-------------------------------------------------
When CONFIG_XFS_DEBUG=y, set WQ_SYSFS on all workqueues that we create so that we (developers) have a means to monitor cpu affinity and whatnot for background workers. In the next patchset we'll expose knobs for more of the workqueues publicly and document it, but not now.
Signed-off-by: Darrick J. Wong djwong@kernel.org Reviewed-by: Dave Chinner dchinner@redhat.com Reviewed-by: Brian Foster bfoster@redhat.com Signed-off-by: Lihong Kou koulihong@huawei.com Reviewed-by: Zhang Yi yi.zhang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- fs/xfs/xfs_log.c | 5 +++-- fs/xfs/xfs_mru_cache.c | 2 +- fs/xfs/xfs_super.c | 23 ++++++++++++++--------- fs/xfs/xfs_super.h | 6 ++++++ 4 files changed, 24 insertions(+), 12 deletions(-)
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 200b92fae5bb..46cb093e365f 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -1418,8 +1418,9 @@ xlog_alloc_log( log->l_iclog->ic_prev = prev_iclog; /* re-write 1st prev ptr */
log->l_ioend_workqueue = alloc_workqueue("xfs-log/%s", - WQ_MEM_RECLAIM | WQ_FREEZABLE | WQ_HIGHPRI, 0, - mp->m_super->s_id); + XFS_WQFLAGS(WQ_FREEZABLE | WQ_MEM_RECLAIM | + WQ_HIGHPRI), + 0, mp->m_super->s_id); if (!log->l_ioend_workqueue) goto out_free_iclog;
diff --git a/fs/xfs/xfs_mru_cache.c b/fs/xfs/xfs_mru_cache.c index a06661dac5be..34c3b16f834f 100644 --- a/fs/xfs/xfs_mru_cache.c +++ b/fs/xfs/xfs_mru_cache.c @@ -294,7 +294,7 @@ int xfs_mru_cache_init(void) { xfs_mru_reap_wq = alloc_workqueue("xfs_mru_cache", - WQ_MEM_RECLAIM|WQ_FREEZABLE, 1); + XFS_WQFLAGS(WQ_MEM_RECLAIM | WQ_FREEZABLE), 1); if (!xfs_mru_reap_wq) return -ENOMEM; return 0; diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 1eca53df25f6..2190f8e15d61 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -515,33 +515,37 @@ xfs_init_mount_workqueues( struct xfs_mount *mp) { mp->m_buf_workqueue = alloc_workqueue("xfs-buf/%s", - WQ_MEM_RECLAIM|WQ_FREEZABLE, 1, mp->m_super->s_id); + XFS_WQFLAGS(WQ_FREEZABLE | WQ_MEM_RECLAIM), + 1, mp->m_super->s_id); if (!mp->m_buf_workqueue) goto out;
mp->m_unwritten_workqueue = alloc_workqueue("xfs-conv/%s", - WQ_MEM_RECLAIM|WQ_FREEZABLE, 0, mp->m_super->s_id); + XFS_WQFLAGS(WQ_FREEZABLE | WQ_MEM_RECLAIM), + 0, mp->m_super->s_id); if (!mp->m_unwritten_workqueue) goto out_destroy_buf;
mp->m_cil_workqueue = alloc_workqueue("xfs-cil/%s", - WQ_MEM_RECLAIM | WQ_FREEZABLE | WQ_UNBOUND, + XFS_WQFLAGS(WQ_FREEZABLE | WQ_MEM_RECLAIM | WQ_UNBOUND), 0, mp->m_super->s_id); if (!mp->m_cil_workqueue) goto out_destroy_unwritten;
mp->m_reclaim_workqueue = alloc_workqueue("xfs-reclaim/%s", - WQ_MEM_RECLAIM|WQ_FREEZABLE, 0, mp->m_super->s_id); + XFS_WQFLAGS(WQ_FREEZABLE | WQ_MEM_RECLAIM), + 0, mp->m_super->s_id); if (!mp->m_reclaim_workqueue) goto out_destroy_cil;
mp->m_eofblocks_workqueue = alloc_workqueue("xfs-eofblocks/%s", - WQ_MEM_RECLAIM|WQ_FREEZABLE, 0, mp->m_super->s_id); + XFS_WQFLAGS(WQ_FREEZABLE | WQ_MEM_RECLAIM), + 0, mp->m_super->s_id); if (!mp->m_eofblocks_workqueue) goto out_destroy_reclaim;
- mp->m_sync_workqueue = alloc_workqueue("xfs-sync/%s", WQ_FREEZABLE, 0, - mp->m_super->s_id); + mp->m_sync_workqueue = alloc_workqueue("xfs-sync/%s", + XFS_WQFLAGS(WQ_FREEZABLE), 0, mp->m_super->s_id); if (!mp->m_sync_workqueue) goto out_destroy_eofb;
@@ -2109,11 +2113,12 @@ xfs_init_workqueues(void) * max_active value for this workqueue. */ xfs_alloc_wq = alloc_workqueue("xfsalloc", - WQ_MEM_RECLAIM|WQ_FREEZABLE, 0); + XFS_WQFLAGS(WQ_MEM_RECLAIM | WQ_FREEZABLE), 0); if (!xfs_alloc_wq) return -ENOMEM;
- xfs_discard_wq = alloc_workqueue("xfsdiscard", WQ_UNBOUND, 0); + xfs_discard_wq = alloc_workqueue("xfsdiscard", XFS_WQFLAGS(WQ_UNBOUND), + 0); if (!xfs_discard_wq) goto out_free_alloc_wq;
diff --git a/fs/xfs/xfs_super.h b/fs/xfs/xfs_super.h index b552cf6d3379..1ca484b8357f 100644 --- a/fs/xfs/xfs_super.h +++ b/fs/xfs/xfs_super.h @@ -75,6 +75,12 @@ extern void xfs_qm_exit(void); XFS_ASSERT_FATAL_STRING \ XFS_DBG_STRING /* DBG must be last */
+#ifdef DEBUG +# define XFS_WQFLAGS(wqflags) (WQ_SYSFS | (wqflags)) +#else +# define XFS_WQFLAGS(wqflags) (wqflags) +#endif + struct xfs_inode; struct xfs_mount; struct xfs_buftarg;
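The XFS_WQFLAGS() macro is a small compile-time pattern worth isolating: it lets one build option OR an extra flag into every workqueue allocation without editing each call site twice. A standalone model with made-up flag values (the real WQ_* constants live in linux/workqueue.h):

#include <stdio.h>

#define WQ_FREEZABLE	(1u << 0)	/* illustrative values only */
#define WQ_SYSFS	(1u << 1)

#ifdef DEBUG
# define MY_WQFLAGS(wqflags)	(WQ_SYSFS | (wqflags))
#else
# define MY_WQFLAGS(wqflags)	(wqflags)
#endif

int main(void)
{
	/* Prints 0x1 normally, 0x3 when built with -DDEBUG. */
	printf("flags=%#x\n", MY_WQFLAGS(WQ_FREEZABLE));
	return 0;
}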
From: "Darrick J. Wong" djwong@kernel.org
mainline-inclusion from mainline-v5.11-rc4 commit f9296569837c3fd66ae32717b0f8f5a26758b4b7 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4KIAO CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
-------------------------------------------------
Move the xfs_{eof,cow}blocks_worker and xfs_queue_{eof,cow}blocks functions further down in the file so that the cleanups in the next patches won't have to pre-declare static functions. No functional changes.
Signed-off-by: Darrick J. Wong djwong@kernel.org Reviewed-by: Christoph Hellwig hch@lst.de Signed-off-by: Lihong Kou koulihong@huawei.com Reviewed-by: Zhang Yi yi.zhang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- fs/xfs/xfs_icache.c | 126 ++++++++++++++++++++++---------------------- 1 file changed, 63 insertions(+), 63 deletions(-)
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index e63eba69183c..edbd926d35dc 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -913,69 +913,6 @@ xfs_inode_walk( return last_error; }
-/* - * Background scanning to trim post-EOF preallocated space. This is queued - * based on the 'speculative_prealloc_lifetime' tunable (5m by default). - */ -void -xfs_queue_eofblocks( - struct xfs_mount *mp) -{ - rcu_read_lock(); - if (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_EOFBLOCKS_TAG)) - queue_delayed_work(mp->m_eofblocks_workqueue, - &mp->m_eofblocks_work, - msecs_to_jiffies(xfs_eofb_secs * 1000)); - rcu_read_unlock(); -} - -void -xfs_eofblocks_worker( - struct work_struct *work) -{ - struct xfs_mount *mp = container_of(to_delayed_work(work), - struct xfs_mount, m_eofblocks_work); - - if (!sb_start_write_trylock(mp->m_super)) - return; - xfs_icache_free_eofblocks(mp, NULL); - sb_end_write(mp->m_super); - - xfs_queue_eofblocks(mp); -} - -/* - * Background scanning to trim preallocated CoW space. This is queued - * based on the 'speculative_cow_prealloc_lifetime' tunable (5m by default). - * (We'll just piggyback on the post-EOF prealloc space workqueue.) - */ -void -xfs_queue_cowblocks( - struct xfs_mount *mp) -{ - rcu_read_lock(); - if (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_COWBLOCKS_TAG)) - queue_delayed_work(mp->m_eofblocks_workqueue, - &mp->m_cowblocks_work, - msecs_to_jiffies(xfs_cowb_secs * 1000)); - rcu_read_unlock(); -} - -void -xfs_cowblocks_worker( - struct work_struct *work) -{ - struct xfs_mount *mp = container_of(to_delayed_work(work), - struct xfs_mount, m_cowblocks_work); - - if (!sb_start_write_trylock(mp->m_super)) - return; - xfs_icache_free_cowblocks(mp, NULL); - sb_end_write(mp->m_super); - - xfs_queue_cowblocks(mp); -} - /* * Grab the inode for reclaim exclusively. * @@ -1396,6 +1333,37 @@ xfs_icache_free_eofblocks( XFS_ICI_EOFBLOCKS_TAG); }
+/* + * Background scanning to trim post-EOF preallocated space. This is queued + * based on the 'speculative_prealloc_lifetime' tunable (5m by default). + */ +void +xfs_queue_eofblocks( + struct xfs_mount *mp) +{ + rcu_read_lock(); + if (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_EOFBLOCKS_TAG)) + queue_delayed_work(mp->m_eofblocks_workqueue, + &mp->m_eofblocks_work, + msecs_to_jiffies(xfs_eofb_secs * 1000)); + rcu_read_unlock(); +} + +void +xfs_eofblocks_worker( + struct work_struct *work) +{ + struct xfs_mount *mp = container_of(to_delayed_work(work), + struct xfs_mount, m_eofblocks_work); + + if (!sb_start_write_trylock(mp->m_super)) + return; + xfs_icache_free_eofblocks(mp, NULL); + sb_end_write(mp->m_super); + + xfs_queue_eofblocks(mp); +} + static inline unsigned long xfs_iflag_for_tag( int tag) @@ -1608,6 +1576,38 @@ xfs_icache_free_cowblocks( XFS_ICI_COWBLOCKS_TAG); }
+/* + * Background scanning to trim preallocated CoW space. This is queued + * based on the 'speculative_cow_prealloc_lifetime' tunable (5m by default). + * (We'll just piggyback on the post-EOF prealloc space workqueue.) + */ +void +xfs_queue_cowblocks( + struct xfs_mount *mp) +{ + rcu_read_lock(); + if (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_COWBLOCKS_TAG)) + queue_delayed_work(mp->m_eofblocks_workqueue, + &mp->m_cowblocks_work, + msecs_to_jiffies(xfs_cowb_secs * 1000)); + rcu_read_unlock(); +} + +void +xfs_cowblocks_worker( + struct work_struct *work) +{ + struct xfs_mount *mp = container_of(to_delayed_work(work), + struct xfs_mount, m_cowblocks_work); + + if (!sb_start_write_trylock(mp->m_super)) + return; + xfs_icache_free_cowblocks(mp, NULL); + sb_end_write(mp->m_super); + + xfs_queue_cowblocks(mp); +} + void xfs_inode_set_cowblocks_tag( xfs_inode_t *ip)
From: "Darrick J. Wong" djwong@kernel.org
mainline-inclusion from mainline-v5.11-rc4 commit 0461a320e33a16405ac3c165463837e028a42680 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4KIAO CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
-------------------------------------------------
Change the one remaining caller of xfs_icache_free_eofblocks to use our new combined blockgc scan function instead, since we will soon be combining the two scans. This introduces a slight behavior change, since the XFS_IOC_FREE_EOFBLOCKS ioctl now clears out speculative CoW reservations in addition to post-EOF blocks.

Signed-off-by: Darrick J. Wong djwong@kernel.org Reviewed-by: Christoph Hellwig hch@lst.de Signed-off-by: Lihong Kou koulihong@huawei.com Reviewed-by: Zhang Yi yi.zhang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- fs/xfs/xfs_icache.c | 2 +- fs/xfs/xfs_icache.h | 1 - fs/xfs/xfs_ioctl.c | 2 +- 3 files changed, 2 insertions(+), 3 deletions(-)
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index edbd926d35dc..7621d6e67bf3 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -1324,7 +1324,7 @@ xfs_inode_free_eofblocks( return ret; }
-int +static int xfs_icache_free_eofblocks( struct xfs_mount *mp, struct xfs_eofblocks *eofb) diff --git a/fs/xfs/xfs_icache.h b/fs/xfs/xfs_icache.h index f7dc8d1c91e5..b8c2f23aa829 100644 --- a/fs/xfs/xfs_icache.h +++ b/fs/xfs/xfs_icache.h @@ -62,7 +62,6 @@ int xfs_blockgc_free_space(struct xfs_mount *mp, struct xfs_eofblocks *eofb);
void xfs_inode_set_eofblocks_tag(struct xfs_inode *ip); void xfs_inode_clear_eofblocks_tag(struct xfs_inode *ip); -int xfs_icache_free_eofblocks(struct xfs_mount *, struct xfs_eofblocks *); void xfs_eofblocks_worker(struct work_struct *); void xfs_queue_eofblocks(struct xfs_mount *);
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index 855aefbc1d83..57cdfac0e7d3 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -2342,7 +2342,7 @@ xfs_file_ioctl( trace_xfs_ioc_free_eofblocks(mp, &keofb, _RET_IP_);
sb_start_write(mp->m_super); - error = xfs_icache_free_eofblocks(mp, &keofb); + error = xfs_blockgc_free_space(mp, &keofb); sb_end_write(mp->m_super); return error; }
From: "Darrick J. Wong" djwong@kernel.org
mainline-inclusion from mainline-v5.11-rc4 commit b943c0cd5615233ae4cea66666725a9bf2edccca category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4KIAO CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
-------------------------------------------------
Change the one remaining caller of xfs_icache_free_cowblocks to use our new combined blockgc scan function instead, since we will soon be combining the two scans. This introduces a slight behavior change, since a readonly remount now clears out post-EOF preallocations and not just CoW staging extents.
Signed-off-by: Darrick J. Wong djwong@kernel.org Reviewed-by: Christoph Hellwig hch@lst.de Signed-off-by: Lihong Kou koulihong@huawei.com Reviewed-by: Zhang Yi yi.zhang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- fs/xfs/xfs_icache.c | 2 +- fs/xfs/xfs_icache.h | 1 - fs/xfs/xfs_super.c | 2 +- 3 files changed, 2 insertions(+), 3 deletions(-)
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index 7621d6e67bf3..ba546b7d861a 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -1567,7 +1567,7 @@ xfs_inode_free_cowblocks( return ret; }
-int +static int xfs_icache_free_cowblocks( struct xfs_mount *mp, struct xfs_eofblocks *eofb) diff --git a/fs/xfs/xfs_icache.h b/fs/xfs/xfs_icache.h index b8c2f23aa829..de7604576e9e 100644 --- a/fs/xfs/xfs_icache.h +++ b/fs/xfs/xfs_icache.h @@ -67,7 +67,6 @@ void xfs_queue_eofblocks(struct xfs_mount *);
void xfs_inode_set_cowblocks_tag(struct xfs_inode *ip); void xfs_inode_clear_cowblocks_tag(struct xfs_inode *ip); -int xfs_icache_free_cowblocks(struct xfs_mount *, struct xfs_eofblocks *); void xfs_cowblocks_worker(struct work_struct *); void xfs_queue_cowblocks(struct xfs_mount *);
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 2190f8e15d61..743ffa901706 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -1737,7 +1737,7 @@ xfs_remount_ro( xfs_stop_block_reaping(mp);
/* Get rid of any leftover CoW reservations... */ - error = xfs_icache_free_cowblocks(mp, NULL); + error = xfs_blockgc_free_space(mp, NULL); if (error) { xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); return error;
From: "Darrick J. Wong" djwong@kernel.org
mainline-inclusion from mainline-v5.11-rc4 commit 865ac8e253c97423c41e22ce615615eb006fc52e category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4KIAO CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
-------------------------------------------------
Get rid of these trivial helpers.
Signed-off-by: Darrick J. Wong djwong@kernel.org Reviewed-by: Christoph Hellwig hch@lst.de Signed-off-by: Lihong Kou koulihong@huawei.com Reviewed-by: Zhang Yi yi.zhang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- fs/xfs/xfs_icache.c | 30 ++++++++---------------------- 1 file changed, 8 insertions(+), 22 deletions(-)
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index ba546b7d861a..352322fe441a 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -1324,15 +1324,6 @@ xfs_inode_free_eofblocks( return ret; }
-static int -xfs_icache_free_eofblocks( - struct xfs_mount *mp, - struct xfs_eofblocks *eofb) -{ - return xfs_inode_walk(mp, 0, xfs_inode_free_eofblocks, eofb, - XFS_ICI_EOFBLOCKS_TAG); -} - /* * Background scanning to trim post-EOF preallocated space. This is queued * based on the 'speculative_prealloc_lifetime' tunable (5m by default). @@ -1358,7 +1349,8 @@ xfs_eofblocks_worker(
if (!sb_start_write_trylock(mp->m_super)) return; - xfs_icache_free_eofblocks(mp, NULL); + xfs_inode_walk(mp, 0, xfs_inode_free_eofblocks, NULL, + XFS_ICI_EOFBLOCKS_TAG); sb_end_write(mp->m_super);
xfs_queue_eofblocks(mp); @@ -1567,15 +1559,6 @@ xfs_inode_free_cowblocks( return ret; }
-static int -xfs_icache_free_cowblocks( - struct xfs_mount *mp, - struct xfs_eofblocks *eofb) -{ - return xfs_inode_walk(mp, 0, xfs_inode_free_cowblocks, eofb, - XFS_ICI_COWBLOCKS_TAG); -} - /* * Background scanning to trim preallocated CoW space. This is queued * based on the 'speculative_cow_prealloc_lifetime' tunable (5m by default). @@ -1602,7 +1585,8 @@ xfs_cowblocks_worker(
if (!sb_start_write_trylock(mp->m_super)) return; - xfs_icache_free_cowblocks(mp, NULL); + xfs_inode_walk(mp, 0, xfs_inode_free_cowblocks, NULL, + XFS_ICI_COWBLOCKS_TAG); sb_end_write(mp->m_super);
xfs_queue_cowblocks(mp); @@ -1653,11 +1637,13 @@ xfs_blockgc_scan( { int error;
- error = xfs_icache_free_eofblocks(mp, eofb); + error = xfs_inode_walk(mp, 0, xfs_inode_free_eofblocks, eofb, + XFS_ICI_EOFBLOCKS_TAG); if (error) return error;
- error = xfs_icache_free_cowblocks(mp, eofb); + error = xfs_inode_walk(mp, 0, xfs_inode_free_cowblocks, eofb, + XFS_ICI_COWBLOCKS_TAG); if (error) return error;
From: "Darrick J. Wong" djwong@kernel.org
mainline-inclusion from mainline-v5.11-rc4 commit ce2d3bbe06473fa76eb9dad21529f9cc48408000 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4KIAO CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
-------------------------------------------------
The clearing of posteof blocks and cowblocks serves the same purpose: removing speculative block preallocations from inactive files. We don't need to burn two radix tree tags on this, so combine them into one.
Signed-off-by: Darrick J. Wong djwong@kernel.org Reviewed-by: Christoph Hellwig hch@lst.de Signed-off-by: Lihong Kou koulihong@huawei.com Reviewed-by: Zhang Yi yi.zhang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- fs/xfs/xfs_icache.c | 114 +++++++++++++++++++++----------------------- fs/xfs/xfs_icache.h | 4 +- fs/xfs/xfs_trace.h | 6 +-- 3 files changed, 58 insertions(+), 66 deletions(-)
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index 352322fe441a..7249764b5900 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -1291,6 +1291,9 @@ xfs_inode_free_eofblocks(
wait = eofb && (eofb->eof_flags & XFS_EOF_FLAGS_SYNC);
+ if (!xfs_iflags_test(ip, XFS_IEOFBLOCKS)) + return 0; + if (!xfs_can_free_eofblocks(ip, false)) { /* inode could be preallocated or append-only */ trace_xfs_inode_free_eofblocks_invalid(ip); @@ -1333,7 +1336,7 @@ xfs_queue_eofblocks( struct xfs_mount *mp) { rcu_read_lock(); - if (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_EOFBLOCKS_TAG)) + if (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_BLOCKGC_TAG)) queue_delayed_work(mp->m_eofblocks_workqueue, &mp->m_eofblocks_work, msecs_to_jiffies(xfs_eofb_secs * 1000)); @@ -1350,67 +1353,54 @@ xfs_eofblocks_worker( if (!sb_start_write_trylock(mp->m_super)) return; xfs_inode_walk(mp, 0, xfs_inode_free_eofblocks, NULL, - XFS_ICI_EOFBLOCKS_TAG); + XFS_ICI_BLOCKGC_TAG); sb_end_write(mp->m_super);
xfs_queue_eofblocks(mp); }
-static inline unsigned long -xfs_iflag_for_tag( - int tag) -{ - switch (tag) { - case XFS_ICI_EOFBLOCKS_TAG: - return XFS_IEOFBLOCKS; - case XFS_ICI_COWBLOCKS_TAG: - return XFS_ICOWBLOCKS; - default: - ASSERT(0); - return 0; - } -} - static void -__xfs_inode_set_blocks_tag( - xfs_inode_t *ip, - void (*execute)(struct xfs_mount *mp), - void (*set_tp)(struct xfs_mount *mp, xfs_agnumber_t agno, - int error, unsigned long caller_ip), - int tag) +xfs_blockgc_set_iflag( + struct xfs_inode *ip, + void (*execute)(struct xfs_mount *mp), + unsigned long iflag) { - struct xfs_mount *mp = ip->i_mount; - struct xfs_perag *pag; - int tagged; + struct xfs_mount *mp = ip->i_mount; + struct xfs_perag *pag; + int tagged; + + ASSERT((iflag & ~(XFS_IEOFBLOCKS | XFS_ICOWBLOCKS)) == 0);
/* * Don't bother locking the AG and looking up in the radix trees * if we already know that we have the tag set. */ - if (ip->i_flags & xfs_iflag_for_tag(tag)) + if (ip->i_flags & iflag) return; spin_lock(&ip->i_flags_lock); - ip->i_flags |= xfs_iflag_for_tag(tag); + ip->i_flags |= iflag; spin_unlock(&ip->i_flags_lock);
pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino)); spin_lock(&pag->pag_ici_lock);
- tagged = radix_tree_tagged(&pag->pag_ici_root, tag); + tagged = radix_tree_tagged(&pag->pag_ici_root, XFS_ICI_BLOCKGC_TAG); radix_tree_tag_set(&pag->pag_ici_root, - XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino), tag); + XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino), + XFS_ICI_BLOCKGC_TAG); if (!tagged) { - /* propagate the eofblocks tag up into the perag radix tree */ + /* propagate the blockgc tag up into the perag radix tree */ spin_lock(&ip->i_mount->m_perag_lock); radix_tree_tag_set(&ip->i_mount->m_perag_tree, XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino), - tag); + XFS_ICI_BLOCKGC_TAG); spin_unlock(&ip->i_mount->m_perag_lock);
/* kick off background trimming */ execute(ip->i_mount);
- set_tp(ip->i_mount, pag->pag_agno, -1, _RET_IP_); + trace_xfs_perag_set_blockgc(ip->i_mount, pag->pag_agno, -1, + _RET_IP_); }
spin_unlock(&pag->pag_ici_lock); @@ -1422,38 +1412,43 @@ xfs_inode_set_eofblocks_tag( xfs_inode_t *ip) { trace_xfs_inode_set_eofblocks_tag(ip); - return __xfs_inode_set_blocks_tag(ip, xfs_queue_eofblocks, - trace_xfs_perag_set_eofblocks, - XFS_ICI_EOFBLOCKS_TAG); + return xfs_blockgc_set_iflag(ip, xfs_queue_eofblocks, XFS_IEOFBLOCKS); }
static void -__xfs_inode_clear_blocks_tag( - xfs_inode_t *ip, - void (*clear_tp)(struct xfs_mount *mp, xfs_agnumber_t agno, - int error, unsigned long caller_ip), - int tag) +xfs_blockgc_clear_iflag( + struct xfs_inode *ip, + unsigned long iflag) { - struct xfs_mount *mp = ip->i_mount; - struct xfs_perag *pag; + struct xfs_mount *mp = ip->i_mount; + struct xfs_perag *pag; + bool clear_tag; + + ASSERT((iflag & ~(XFS_IEOFBLOCKS | XFS_ICOWBLOCKS)) == 0);
spin_lock(&ip->i_flags_lock); - ip->i_flags &= ~xfs_iflag_for_tag(tag); + ip->i_flags &= ~iflag; + clear_tag = (ip->i_flags & (XFS_IEOFBLOCKS | XFS_ICOWBLOCKS)) == 0; spin_unlock(&ip->i_flags_lock);
+ if (!clear_tag) + return; + pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino)); spin_lock(&pag->pag_ici_lock);
radix_tree_tag_clear(&pag->pag_ici_root, - XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino), tag); - if (!radix_tree_tagged(&pag->pag_ici_root, tag)) { - /* clear the eofblocks tag from the perag radix tree */ + XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino), + XFS_ICI_BLOCKGC_TAG); + if (!radix_tree_tagged(&pag->pag_ici_root, XFS_ICI_BLOCKGC_TAG)) { + /* clear the blockgc tag from the perag radix tree */ spin_lock(&ip->i_mount->m_perag_lock); radix_tree_tag_clear(&ip->i_mount->m_perag_tree, XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino), - tag); + XFS_ICI_BLOCKGC_TAG); spin_unlock(&ip->i_mount->m_perag_lock); - clear_tp(ip->i_mount, pag->pag_agno, -1, _RET_IP_); + trace_xfs_perag_clear_blockgc(ip->i_mount, pag->pag_agno, -1, + _RET_IP_); }
spin_unlock(&pag->pag_ici_lock); @@ -1465,8 +1460,7 @@ xfs_inode_clear_eofblocks_tag( xfs_inode_t *ip) { trace_xfs_inode_clear_eofblocks_tag(ip); - return __xfs_inode_clear_blocks_tag(ip, - trace_xfs_perag_clear_eofblocks, XFS_ICI_EOFBLOCKS_TAG); + return xfs_blockgc_clear_iflag(ip, XFS_IEOFBLOCKS); }
/* @@ -1524,6 +1518,9 @@ xfs_inode_free_cowblocks(
wait = eofb && (eofb->eof_flags & XFS_EOF_FLAGS_SYNC);
+ if (!xfs_iflags_test(ip, XFS_ICOWBLOCKS)) + return 0; + if (!xfs_prep_free_cowblocks(ip)) return 0;
@@ -1569,7 +1566,7 @@ xfs_queue_cowblocks( struct xfs_mount *mp) { rcu_read_lock(); - if (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_COWBLOCKS_TAG)) + if (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_BLOCKGC_TAG)) queue_delayed_work(mp->m_eofblocks_workqueue, &mp->m_cowblocks_work, msecs_to_jiffies(xfs_cowb_secs * 1000)); @@ -1586,7 +1583,7 @@ xfs_cowblocks_worker( if (!sb_start_write_trylock(mp->m_super)) return; xfs_inode_walk(mp, 0, xfs_inode_free_cowblocks, NULL, - XFS_ICI_COWBLOCKS_TAG); + XFS_ICI_BLOCKGC_TAG); sb_end_write(mp->m_super);
xfs_queue_cowblocks(mp); @@ -1597,9 +1594,7 @@ xfs_inode_set_cowblocks_tag( xfs_inode_t *ip) { trace_xfs_inode_set_cowblocks_tag(ip); - return __xfs_inode_set_blocks_tag(ip, xfs_queue_cowblocks, - trace_xfs_perag_set_cowblocks, - XFS_ICI_COWBLOCKS_TAG); + return xfs_blockgc_set_iflag(ip, xfs_queue_cowblocks, XFS_ICOWBLOCKS); }
void @@ -1607,8 +1602,7 @@ xfs_inode_clear_cowblocks_tag( xfs_inode_t *ip) { trace_xfs_inode_clear_cowblocks_tag(ip); - return __xfs_inode_clear_blocks_tag(ip, - trace_xfs_perag_clear_cowblocks, XFS_ICI_COWBLOCKS_TAG); + return xfs_blockgc_clear_iflag(ip, XFS_ICOWBLOCKS); }
/* Disable post-EOF and CoW block auto-reclamation. */ @@ -1638,12 +1632,12 @@ xfs_blockgc_scan( int error;
error = xfs_inode_walk(mp, 0, xfs_inode_free_eofblocks, eofb, - XFS_ICI_EOFBLOCKS_TAG); + XFS_ICI_BLOCKGC_TAG); if (error) return error;
error = xfs_inode_walk(mp, 0, xfs_inode_free_cowblocks, eofb, - XFS_ICI_COWBLOCKS_TAG); + XFS_ICI_BLOCKGC_TAG); if (error) return error;
diff --git a/fs/xfs/xfs_icache.h b/fs/xfs/xfs_icache.h index de7604576e9e..ab7107370be4 100644 --- a/fs/xfs/xfs_icache.h +++ b/fs/xfs/xfs_icache.h @@ -23,8 +23,8 @@ struct xfs_eofblocks { #define XFS_ICI_NO_TAG (-1) /* special flag for an untagged lookup in xfs_inode_walk */ #define XFS_ICI_RECLAIM_TAG 0 /* inode is to be reclaimed */ -#define XFS_ICI_EOFBLOCKS_TAG 1 /* inode has blocks beyond EOF */ -#define XFS_ICI_COWBLOCKS_TAG 2 /* inode can have cow blocks to gc */ +/* Inode has speculative preallocations (posteof or cow) to clean. */ +#define XFS_ICI_BLOCKGC_TAG 1
/* * Flags for xfs_iget() diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 4a0f4354992c..ab761b160e2c 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -137,10 +137,8 @@ DEFINE_PERAG_REF_EVENT(xfs_perag_get_tag); DEFINE_PERAG_REF_EVENT(xfs_perag_put); DEFINE_PERAG_REF_EVENT(xfs_perag_set_reclaim); DEFINE_PERAG_REF_EVENT(xfs_perag_clear_reclaim); -DEFINE_PERAG_REF_EVENT(xfs_perag_set_eofblocks); -DEFINE_PERAG_REF_EVENT(xfs_perag_clear_eofblocks); -DEFINE_PERAG_REF_EVENT(xfs_perag_set_cowblocks); -DEFINE_PERAG_REF_EVENT(xfs_perag_clear_cowblocks); +DEFINE_PERAG_REF_EVENT(xfs_perag_set_blockgc); +DEFINE_PERAG_REF_EVENT(xfs_perag_clear_blockgc);
DECLARE_EVENT_CLASS(xfs_ag_class, TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno),
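The invariant this patch establishes is easy to state but worth pinning down: one radix tree tag now backs two per-inode flags, so the tag may be set whenever either flag is set, and may be cleared only when both are clear, which is exactly the clear_tag check added to xfs_blockgc_clear_iflag(). A self-contained model of that invariant (illustrative constants, not the kernel's):

#include <stdbool.h>
#include <stdio.h>

#define IEOFBLOCKS (1u << 0)
#define ICOWBLOCKS (1u << 1)

static unsigned int i_flags;
static bool blockgc_tagged;	/* stands in for XFS_ICI_BLOCKGC_TAG */

static void set_iflag(unsigned int iflag)
{
	i_flags |= iflag;
	blockgc_tagged = true;	/* tag set on the first flag */
}

static void clear_iflag(unsigned int iflag)
{
	i_flags &= ~iflag;
	if (i_flags & (IEOFBLOCKS | ICOWBLOCKS))
		return;		/* the other flag still needs the scan */
	blockgc_tagged = false;
}

int main(void)
{
	set_iflag(IEOFBLOCKS);
	set_iflag(ICOWBLOCKS);
	clear_iflag(IEOFBLOCKS);
	printf("still tagged: %d\n", blockgc_tagged);	/* prints 1 */
	clear_iflag(ICOWBLOCKS);
	printf("still tagged: %d\n", blockgc_tagged);	/* prints 0 */
	return 0;
}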
From: "Darrick J. Wong" djwong@kernel.org
mainline-inclusion from mainline-v5.11-rc4 commit 9669f51de5c0c93e79257f690d1feaf16ebc179b category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4KIAO CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
-------------------------------------------------
Remove the separate cowblocks work items and knob so that we can control and run everything from a single blockgc work queue. Note that the speculative_prealloc_lifetime sysctl knob retains its historical name even though the functions move to the xfs_blockgc_* prefix.
Signed-off-by: Darrick J. Wong djwong@kernel.org Reviewed-by: Christoph Hellwig hch@lst.de Signed-off-by: Lihong Kou koulihong@huawei.com Reviewed-by: Zhang Yi yi.zhang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- fs/xfs/xfs_globals.c | 7 ++-- fs/xfs/xfs_icache.c | 96 ++++++++++++++------------------------------ fs/xfs/xfs_icache.h | 6 +-- fs/xfs/xfs_linux.h | 3 +- fs/xfs/xfs_mount.h | 6 +-- fs/xfs/xfs_super.c | 11 +++-- fs/xfs/xfs_sysctl.c | 15 ++----- fs/xfs/xfs_sysctl.h | 3 +- 8 files changed, 48 insertions(+), 99 deletions(-)
diff --git a/fs/xfs/xfs_globals.c b/fs/xfs/xfs_globals.c index fa55ab8b8d80..f62fa652c2fd 100644 --- a/fs/xfs/xfs_globals.c +++ b/fs/xfs/xfs_globals.c @@ -8,8 +8,8 @@ /* * Tunable XFS parameters. xfs_params is required even when CONFIG_SYSCTL=n, * other XFS code uses these values. Times are measured in centisecs (i.e. - * 100ths of a second) with the exception of eofb_timer and cowb_timer, which - * are measured in seconds. + * 100ths of a second) with the exception of blockgc_timer, which is measured + * in seconds. */ xfs_param_t xfs_params = { /* MIN DFLT MAX */ @@ -28,8 +28,7 @@ xfs_param_t xfs_params = { .rotorstep = { 1, 1, 255 }, .inherit_nodfrg = { 0, 1, 1 }, .fstrm_timer = { 1, 30*100, 3600*100}, - .eofb_timer = { 1, 300, 3600*24}, - .cowb_timer = { 1, 1800, 3600*24}, + .blockgc_timer = { 1, 300, 3600*24}, };
struct xfs_globals xfs_globals = { diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index 7249764b5900..30ddeedac3f3 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -1328,41 +1328,24 @@ xfs_inode_free_eofblocks( }
/* - * Background scanning to trim post-EOF preallocated space. This is queued - * based on the 'speculative_prealloc_lifetime' tunable (5m by default). + * Background scanning to trim preallocated space. This is queued based on the + * 'speculative_prealloc_lifetime' tunable (5m by default). */ -void -xfs_queue_eofblocks( - struct xfs_mount *mp) +static inline void +xfs_blockgc_queue( + struct xfs_mount *mp) { rcu_read_lock(); if (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_BLOCKGC_TAG)) - queue_delayed_work(mp->m_eofblocks_workqueue, - &mp->m_eofblocks_work, - msecs_to_jiffies(xfs_eofb_secs * 1000)); + queue_delayed_work(mp->m_blockgc_workqueue, + &mp->m_blockgc_work, + msecs_to_jiffies(xfs_blockgc_secs * 1000)); rcu_read_unlock(); }
-void -xfs_eofblocks_worker( - struct work_struct *work) -{ - struct xfs_mount *mp = container_of(to_delayed_work(work), - struct xfs_mount, m_eofblocks_work); - - if (!sb_start_write_trylock(mp->m_super)) - return; - xfs_inode_walk(mp, 0, xfs_inode_free_eofblocks, NULL, - XFS_ICI_BLOCKGC_TAG); - sb_end_write(mp->m_super); - - xfs_queue_eofblocks(mp); -} - static void xfs_blockgc_set_iflag( struct xfs_inode *ip, - void (*execute)(struct xfs_mount *mp), unsigned long iflag) { struct xfs_mount *mp = ip->i_mount; @@ -1397,7 +1380,7 @@ xfs_blockgc_set_iflag( spin_unlock(&ip->i_mount->m_perag_lock);
/* kick off background trimming */ - execute(ip->i_mount); + xfs_blockgc_queue(ip->i_mount);
trace_xfs_perag_set_blockgc(ip->i_mount, pag->pag_agno, -1, _RET_IP_); @@ -1412,7 +1395,7 @@ xfs_inode_set_eofblocks_tag( xfs_inode_t *ip) { trace_xfs_inode_set_eofblocks_tag(ip); - return xfs_blockgc_set_iflag(ip, xfs_queue_eofblocks, XFS_IEOFBLOCKS); + return xfs_blockgc_set_iflag(ip, XFS_IEOFBLOCKS); }
static void @@ -1556,45 +1539,12 @@ xfs_inode_free_cowblocks( return ret; }
-/* - * Background scanning to trim preallocated CoW space. This is queued - * based on the 'speculative_cow_prealloc_lifetime' tunable (5m by default). - * (We'll just piggyback on the post-EOF prealloc space workqueue.) - */ -void -xfs_queue_cowblocks( - struct xfs_mount *mp) -{ - rcu_read_lock(); - if (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_BLOCKGC_TAG)) - queue_delayed_work(mp->m_eofblocks_workqueue, - &mp->m_cowblocks_work, - msecs_to_jiffies(xfs_cowb_secs * 1000)); - rcu_read_unlock(); -} - -void -xfs_cowblocks_worker( - struct work_struct *work) -{ - struct xfs_mount *mp = container_of(to_delayed_work(work), - struct xfs_mount, m_cowblocks_work); - - if (!sb_start_write_trylock(mp->m_super)) - return; - xfs_inode_walk(mp, 0, xfs_inode_free_cowblocks, NULL, - XFS_ICI_BLOCKGC_TAG); - sb_end_write(mp->m_super); - - xfs_queue_cowblocks(mp); -} - void xfs_inode_set_cowblocks_tag( xfs_inode_t *ip) { trace_xfs_inode_set_cowblocks_tag(ip); - return xfs_blockgc_set_iflag(ip, xfs_queue_cowblocks, XFS_ICOWBLOCKS); + return xfs_blockgc_set_iflag(ip, XFS_ICOWBLOCKS); }
void @@ -1610,8 +1560,7 @@ void xfs_stop_block_reaping( struct xfs_mount *mp) { - cancel_delayed_work_sync(&mp->m_eofblocks_work); - cancel_delayed_work_sync(&mp->m_cowblocks_work); + cancel_delayed_work_sync(&mp->m_blockgc_work); }
/* Enable post-EOF and CoW block auto-reclamation. */ @@ -1619,8 +1568,7 @@ void xfs_start_block_reaping( struct xfs_mount *mp) { - xfs_queue_eofblocks(mp); - xfs_queue_cowblocks(mp); + xfs_blockgc_queue(mp); }
/* Scan all incore inodes for block preallocations that we can remove. */ @@ -1644,6 +1592,24 @@ xfs_blockgc_scan( return 0; }
+/* Background worker that trims preallocated space. */ +void +xfs_blockgc_worker( + struct work_struct *work) +{ + struct xfs_mount *mp = container_of(to_delayed_work(work), + struct xfs_mount, m_blockgc_work); + int error; + + if (!sb_start_write_trylock(mp->m_super)) + return; + error = xfs_blockgc_scan(mp, NULL); + if (error) + xfs_info(mp, "preallocation gc worker failed, err=%d", error); + sb_end_write(mp->m_super); + xfs_blockgc_queue(mp); +} + /* * Try to free space in the filesystem by purging eofblocks and cowblocks. */ diff --git a/fs/xfs/xfs_icache.h b/fs/xfs/xfs_icache.h index ab7107370be4..5c57476287f6 100644 --- a/fs/xfs/xfs_icache.h +++ b/fs/xfs/xfs_icache.h @@ -62,13 +62,11 @@ int xfs_blockgc_free_space(struct xfs_mount *mp, struct xfs_eofblocks *eofb);
void xfs_inode_set_eofblocks_tag(struct xfs_inode *ip); void xfs_inode_clear_eofblocks_tag(struct xfs_inode *ip); -void xfs_eofblocks_worker(struct work_struct *); -void xfs_queue_eofblocks(struct xfs_mount *);
void xfs_inode_set_cowblocks_tag(struct xfs_inode *ip); void xfs_inode_clear_cowblocks_tag(struct xfs_inode *ip); -void xfs_cowblocks_worker(struct work_struct *); -void xfs_queue_cowblocks(struct xfs_mount *); + +void xfs_blockgc_worker(struct work_struct *work);
int xfs_inode_walk(struct xfs_mount *mp, int iter_flags, int (*execute)(struct xfs_inode *ip, void *args), diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h index 08e3bd3f859a..953d98bc4832 100644 --- a/fs/xfs/xfs_linux.h +++ b/fs/xfs/xfs_linux.h @@ -98,8 +98,7 @@ typedef __u32 xfs_nlink_t; #define xfs_rotorstep xfs_params.rotorstep.val #define xfs_inherit_nodefrag xfs_params.inherit_nodfrg.val #define xfs_fstrm_centisecs xfs_params.fstrm_timer.val -#define xfs_eofb_secs xfs_params.eofb_timer.val -#define xfs_cowb_secs xfs_params.cowb_timer.val +#define xfs_blockgc_secs xfs_params.blockgc_timer.val
#define current_cpu() (raw_smp_processor_id()) #define current_set_flags_nested(sp, f) \ diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index 481d1e52b541..f5ba5645ccd8 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -94,7 +94,7 @@ typedef struct xfs_mount { struct workqueue_struct *m_unwritten_workqueue; struct workqueue_struct *m_cil_workqueue; struct workqueue_struct *m_reclaim_workqueue; - struct workqueue_struct *m_eofblocks_workqueue; + struct workqueue_struct *m_blockgc_workqueue; struct workqueue_struct *m_sync_workqueue;
int m_bsize; /* fs logical block size */ @@ -178,9 +178,7 @@ typedef struct xfs_mount { uint64_t m_resblks_avail;/* available reserved blocks */ uint64_t m_resblks_save; /* reserved blks @ remount,ro */ struct delayed_work m_reclaim_work; /* background inode reclaim */ - struct delayed_work m_eofblocks_work; /* background eof blocks - trimming */ - struct delayed_work m_cowblocks_work; /* background cow blocks + struct delayed_work m_blockgc_work; /* background prealloc blocks trimming */ struct xfs_kobj m_kobj; struct xfs_kobj m_error_kobj; diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 743ffa901706..f2e3b6f956e8 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -538,10 +538,10 @@ xfs_init_mount_workqueues( if (!mp->m_reclaim_workqueue) goto out_destroy_cil;
- mp->m_eofblocks_workqueue = alloc_workqueue("xfs-eofblocks/%s", + mp->m_blockgc_workqueue = alloc_workqueue("xfs-blockgc/%s", XFS_WQFLAGS(WQ_FREEZABLE | WQ_MEM_RECLAIM), 0, mp->m_super->s_id); - if (!mp->m_eofblocks_workqueue) + if (!mp->m_blockgc_workqueue) goto out_destroy_reclaim;
mp->m_sync_workqueue = alloc_workqueue("xfs-sync/%s", @@ -552,7 +552,7 @@ xfs_init_mount_workqueues( return 0;
out_destroy_eofb: - destroy_workqueue(mp->m_eofblocks_workqueue); + destroy_workqueue(mp->m_blockgc_workqueue); out_destroy_reclaim: destroy_workqueue(mp->m_reclaim_workqueue); out_destroy_cil: @@ -570,7 +570,7 @@ xfs_destroy_mount_workqueues( struct xfs_mount *mp) { destroy_workqueue(mp->m_sync_workqueue); - destroy_workqueue(mp->m_eofblocks_workqueue); + destroy_workqueue(mp->m_blockgc_workqueue); destroy_workqueue(mp->m_reclaim_workqueue); destroy_workqueue(mp->m_cil_workqueue); destroy_workqueue(mp->m_unwritten_workqueue); @@ -1866,8 +1866,7 @@ static int xfs_init_fs_context( mutex_init(&mp->m_growlock); INIT_WORK(&mp->m_flush_inodes_work, xfs_flush_inodes_worker); INIT_DELAYED_WORK(&mp->m_reclaim_work, xfs_reclaim_worker); - INIT_DELAYED_WORK(&mp->m_eofblocks_work, xfs_eofblocks_worker); - INIT_DELAYED_WORK(&mp->m_cowblocks_work, xfs_cowblocks_worker); + INIT_DELAYED_WORK(&mp->m_blockgc_work, xfs_blockgc_worker); mp->m_kobj.kobject.kset = xfs_kset; /* * We don't create the finobt per-ag space reservation until after log diff --git a/fs/xfs/xfs_sysctl.c b/fs/xfs/xfs_sysctl.c index fac9de7ee6d0..145e06c47744 100644 --- a/fs/xfs/xfs_sysctl.c +++ b/fs/xfs/xfs_sysctl.c @@ -194,21 +194,12 @@ static struct ctl_table xfs_table[] = { }, { .procname = "speculative_prealloc_lifetime", - .data = &xfs_params.eofb_timer.val, + .data = &xfs_params.blockgc_timer.val, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &xfs_params.eofb_timer.min, - .extra2 = &xfs_params.eofb_timer.max, - }, - { - .procname = "speculative_cow_prealloc_lifetime", - .data = &xfs_params.cowb_timer.val, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &xfs_params.cowb_timer.min, - .extra2 = &xfs_params.cowb_timer.max, + .extra1 = &xfs_params.blockgc_timer.min, + .extra2 = &xfs_params.blockgc_timer.max, }, /* please keep this the last entry */ #ifdef CONFIG_PROC_FS diff --git a/fs/xfs/xfs_sysctl.h b/fs/xfs/xfs_sysctl.h index 8abf4640f1d5..7692e76ead33 100644 --- a/fs/xfs/xfs_sysctl.h +++ b/fs/xfs/xfs_sysctl.h @@ -35,8 +35,7 @@ typedef struct xfs_param { xfs_sysctl_val_t rotorstep; /* inode32 AG rotoring control knob */ xfs_sysctl_val_t inherit_nodfrg;/* Inherit the "nodefrag" inode flag. */ xfs_sysctl_val_t fstrm_timer; /* Filestream dir-AG assoc'n timeout. */ - xfs_sysctl_val_t eofb_timer; /* Interval between eofb scan wakeups */ - xfs_sysctl_val_t cowb_timer; /* Interval between cowb scan wakeups */ + xfs_sysctl_val_t blockgc_timer; /* Interval between blockgc scans */ } xfs_param_t;
/*
From: "Darrick J. Wong" djwong@kernel.org
mainline-inclusion from mainline-v5.11-rc4 commit 419567534e16eb553e7c19eecaa4d03cbc6693be category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4KIAO CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
-------------------------------------------------
Perform background block preallocation gc scans more efficiently by walking the incore inode tree once.
Signed-off-by: Darrick J. Wong djwong@kernel.org Reviewed-by: Christoph Hellwig hch@lst.de Signed-off-by: Lihong Kou koulihong@huawei.com Reviewed-by: Zhang Yi yi.zhang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- fs/xfs/xfs_icache.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-)
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index 30ddeedac3f3..40abf9ea0d57 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -1571,21 +1571,19 @@ xfs_start_block_reaping( xfs_blockgc_queue(mp); }
-/* Scan all incore inodes for block preallocations that we can remove. */ -static inline int -xfs_blockgc_scan( - struct xfs_mount *mp, - struct xfs_eofblocks *eofb) +/* Scan one incore inode for block preallocations that we can remove. */ +static int +xfs_blockgc_scan_inode( + struct xfs_inode *ip, + void *args) { int error;
- error = xfs_inode_walk(mp, 0, xfs_inode_free_eofblocks, eofb, - XFS_ICI_BLOCKGC_TAG); + error = xfs_inode_free_eofblocks(ip, args); if (error) return error;
- error = xfs_inode_walk(mp, 0, xfs_inode_free_cowblocks, eofb, - XFS_ICI_BLOCKGC_TAG); + error = xfs_inode_free_cowblocks(ip, args); if (error) return error;
@@ -1603,7 +1601,8 @@ xfs_blockgc_worker(
if (!sb_start_write_trylock(mp->m_super)) return; - error = xfs_blockgc_scan(mp, NULL); + error = xfs_inode_walk(mp, 0, xfs_blockgc_scan_inode, NULL, + XFS_ICI_BLOCKGC_TAG); if (error) xfs_info(mp, "preallocation gc worker failed, err=%d", error); sb_end_write(mp->m_super); @@ -1620,7 +1619,8 @@ xfs_blockgc_free_space( { trace_xfs_blockgc_free_space(mp, eofb, _RET_IP_);
- return xfs_blockgc_scan(mp, eofb); + return xfs_inode_walk(mp, 0, xfs_blockgc_scan_inode, eofb, + XFS_ICI_BLOCKGC_TAG); }
/*
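To make the single-pass structure concrete, here is a minimal userspace C model of the walk above. It is only a sketch: the types and helper names (fake_inode, free_eof, free_cow, scan_inode) are invented for illustration and are not the XFS functions; the array stands in for the incore inode radix tree.

#include <stdio.h>

struct fake_inode { int has_eof; int has_cow; };

static int free_eof(struct fake_inode *ip) { ip->has_eof = 0; return 0; }
static int free_cow(struct fake_inode *ip) { ip->has_cow = 0; return 0; }

/* One callback per inode handles both kinds of preallocation... */
static int scan_inode(struct fake_inode *ip)
{
	int error = free_eof(ip);
	if (error)
		return error;
	return free_cow(ip);
}

int main(void)
{
	struct fake_inode inodes[] = { {1, 0}, {0, 1}, {1, 1} };
	int i;

	/* ...so the "tree" (here: an array) is walked exactly once. */
	for (i = 0; i < 3; i++)
		if (scan_inode(&inodes[i]))
			return 1;
	puts("one pass, both gc steps per inode");
	return 0;
}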
From: "Darrick J. Wong" djwong@kernel.org
mainline-inclusion from mainline-v5.11-rc4 commit c9a6526fe7ae64528d924c6f255af15312211432 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4KIAO CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
-------------------------------------------------
Shorten the names of the two functions that start and stop block preallocation garbage collection and move them up to the other blockgc functions.
Signed-off-by: Darrick J. Wong djwong@kernel.org Reviewed-by: Christoph Hellwig hch@lst.de Signed-off-by: Lihong Kou koulihong@huawei.com Reviewed-by: Zhang Yi yi.zhang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- fs/xfs/scrub/common.c | 4 ++-- fs/xfs/xfs_icache.c | 4 ++-- fs/xfs/xfs_icache.h | 4 ++-- fs/xfs/xfs_mount.c | 2 +- fs/xfs/xfs_super.c | 8 ++++---- 5 files changed, 11 insertions(+), 11 deletions(-)
diff --git a/fs/xfs/scrub/common.c b/fs/xfs/scrub/common.c index 54b5e9a61455..922c1d84006b 100644 --- a/fs/xfs/scrub/common.c +++ b/fs/xfs/scrub/common.c @@ -904,7 +904,7 @@ xchk_stop_reaping( struct xfs_scrub *sc) { sc->flags |= XCHK_REAPING_DISABLED; - xfs_stop_block_reaping(sc->mp); + xfs_blockgc_stop(sc->mp); }
/* Restart background reaping of resources. */ @@ -912,6 +912,6 @@ void xchk_start_reaping( struct xfs_scrub *sc) { - xfs_start_block_reaping(sc->mp); + xfs_blockgc_start(sc->mp); sc->flags &= ~XCHK_REAPING_DISABLED; } diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index 40abf9ea0d57..03330c1914aa 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -1557,7 +1557,7 @@ xfs_inode_clear_cowblocks_tag(
/* Disable post-EOF and CoW block auto-reclamation. */ void -xfs_stop_block_reaping( +xfs_blockgc_stop( struct xfs_mount *mp) { cancel_delayed_work_sync(&mp->m_blockgc_work); @@ -1565,7 +1565,7 @@ xfs_stop_block_reaping(
/* Enable post-EOF and CoW block auto-reclamation. */ void -xfs_start_block_reaping( +xfs_blockgc_start( struct xfs_mount *mp) { xfs_blockgc_queue(mp); diff --git a/fs/xfs/xfs_icache.h b/fs/xfs/xfs_icache.h index 5c57476287f6..d1fddb152420 100644 --- a/fs/xfs/xfs_icache.h +++ b/fs/xfs/xfs_icache.h @@ -75,7 +75,7 @@ int xfs_inode_walk(struct xfs_mount *mp, int iter_flags, int xfs_icache_inode_is_allocated(struct xfs_mount *mp, struct xfs_trans *tp, xfs_ino_t ino, bool *inuse);
-void xfs_stop_block_reaping(struct xfs_mount *mp); -void xfs_start_block_reaping(struct xfs_mount *mp); +void xfs_blockgc_stop(struct xfs_mount *mp); +void xfs_blockgc_start(struct xfs_mount *mp);
#endif diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index ed488852188f..abc1335d9b3d 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -1062,7 +1062,7 @@ xfs_unmountfs( uint64_t resblks; int error;
- xfs_stop_block_reaping(mp); + xfs_blockgc_stop(mp); xfs_fs_unreserve_ag_blocks(mp); xfs_qm_unmount_quotas(mp); xfs_rtunmount_inodes(mp); diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index f2e3b6f956e8..8982776522d3 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -944,7 +944,7 @@ xfs_fs_freeze( * set a GFP_NOFS context here to avoid recursion deadlocks. */ flags = memalloc_nofs_save(); - xfs_stop_block_reaping(mp); + xfs_blockgc_stop(mp); xfs_save_resvblks(mp); xfs_quiesce_attr(mp); ret = xfs_sync_sb(mp, true); @@ -960,7 +960,7 @@ xfs_fs_unfreeze(
xfs_restore_resvblks(mp); xfs_log_work_queue(mp); - xfs_start_block_reaping(mp); + xfs_blockgc_start(mp); return 0; }
@@ -1714,7 +1714,7 @@ xfs_remount_rw( xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); return error; } - xfs_start_block_reaping(mp); + xfs_blockgc_start(mp);
/* Create the per-AG metadata reservation pool .*/ error = xfs_fs_reserve_ag_blocks(mp); @@ -1734,7 +1734,7 @@ xfs_remount_ro( * Cancel background eofb scanning so it cannot race with the final * log force+buftarg wait and deadlock the remount. */ - xfs_stop_block_reaping(mp); + xfs_blockgc_stop(mp);
/* Get rid of any leftover CoW reservations... */ error = xfs_blockgc_free_space(mp, NULL);
From: "Darrick J. Wong" djwong@kernel.org
mainline-inclusion from mainline-v5.11-rc4 commit 894ecacf0f27fd1701c34f2946148b7f017bf984 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4KIAO CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
-------------------------------------------------
Split the block preallocation garbage collection work into per-AG work items so that we can take advantage of parallelization.
Signed-off-by: Darrick J. Wong djwong@kernel.org Reviewed-by: Christoph Hellwig hch@lst.de Signed-off-by: Lihong Kou koulihong@huawei.com Reviewed-by: Zhang Yi yi.zhang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- fs/xfs/xfs_icache.c | 42 ++++++++++++++++++++++++++++++------------ fs/xfs/xfs_mount.c | 3 +++ fs/xfs/xfs_mount.h | 5 +++-- fs/xfs/xfs_super.c | 4 ++-- 4 files changed, 38 insertions(+), 16 deletions(-)
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index 03330c1914aa..e3a94178e6a8 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -1333,12 +1333,12 @@ xfs_inode_free_eofblocks( */ static inline void xfs_blockgc_queue( - struct xfs_mount *mp) + struct xfs_perag *pag) { rcu_read_lock(); - if (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_BLOCKGC_TAG)) - queue_delayed_work(mp->m_blockgc_workqueue, - &mp->m_blockgc_work, + if (radix_tree_tagged(&pag->pag_ici_root, XFS_ICI_BLOCKGC_TAG)) + queue_delayed_work(pag->pag_mount->m_blockgc_workqueue, + &pag->pag_blockgc_work, msecs_to_jiffies(xfs_blockgc_secs * 1000)); rcu_read_unlock(); } @@ -1380,7 +1380,7 @@ xfs_blockgc_set_iflag( spin_unlock(&ip->i_mount->m_perag_lock);
/* kick off background trimming */ - xfs_blockgc_queue(ip->i_mount); + xfs_blockgc_queue(pag);
trace_xfs_perag_set_blockgc(ip->i_mount, pag->pag_agno, -1, _RET_IP_); @@ -1555,12 +1555,24 @@ xfs_inode_clear_cowblocks_tag( return xfs_blockgc_clear_iflag(ip, XFS_ICOWBLOCKS); }
+#define for_each_perag_tag(mp, next_agno, pag, tag) \ + for ((next_agno) = 0, (pag) = xfs_perag_get_tag((mp), 0, (tag)); \ + (pag) != NULL; \ + (next_agno) = (pag)->pag_agno + 1, \ + xfs_perag_put(pag), \ + (pag) = xfs_perag_get_tag((mp), (next_agno), (tag))) + + /* Disable post-EOF and CoW block auto-reclamation. */ void xfs_blockgc_stop( struct xfs_mount *mp) { - cancel_delayed_work_sync(&mp->m_blockgc_work); + struct xfs_perag *pag; + xfs_agnumber_t agno; + + for_each_perag_tag(mp, agno, pag, XFS_ICI_BLOCKGC_TAG) + cancel_delayed_work_sync(&pag->pag_blockgc_work); }
/* Enable post-EOF and CoW block auto-reclamation. */ @@ -1568,7 +1580,11 @@ void xfs_blockgc_start( struct xfs_mount *mp) { - xfs_blockgc_queue(mp); + struct xfs_perag *pag; + xfs_agnumber_t agno; + + for_each_perag_tag(mp, agno, pag, XFS_ICI_BLOCKGC_TAG) + xfs_blockgc_queue(pag); }
/* Scan one incore inode for block preallocations that we can remove. */ @@ -1595,18 +1611,20 @@ void xfs_blockgc_worker( struct work_struct *work) { - struct xfs_mount *mp = container_of(to_delayed_work(work), - struct xfs_mount, m_blockgc_work); + struct xfs_perag *pag = container_of(to_delayed_work(work), + struct xfs_perag, pag_blockgc_work); + struct xfs_mount *mp = pag->pag_mount; int error;
if (!sb_start_write_trylock(mp->m_super)) return; - error = xfs_inode_walk(mp, 0, xfs_blockgc_scan_inode, NULL, + error = xfs_inode_walk_ag(pag, 0, xfs_blockgc_scan_inode, NULL, XFS_ICI_BLOCKGC_TAG); if (error) - xfs_info(mp, "preallocation gc worker failed, err=%d", error); + xfs_info(mp, "AG %u preallocation gc worker failed, err=%d", + pag->pag_agno, error); sb_end_write(mp->m_super); - xfs_blockgc_queue(mp); + xfs_blockgc_queue(pag); }
/* diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index abc1335d9b3d..6845d9d432b4 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -126,6 +126,7 @@ __xfs_free_perag( { struct xfs_perag *pag = container_of(head, struct xfs_perag, rcu_head);
+ ASSERT(!delayed_work_pending(&pag->pag_blockgc_work)); ASSERT(atomic_read(&pag->pag_ref) == 0); kmem_free(pag); } @@ -146,6 +147,7 @@ xfs_free_perag( spin_unlock(&mp->m_perag_lock); ASSERT(pag); ASSERT(atomic_read(&pag->pag_ref) == 0); + cancel_delayed_work_sync(&pag->pag_blockgc_work); xfs_iunlink_destroy(pag); xfs_buf_hash_destroy(pag); call_rcu(&pag->rcu_head, __xfs_free_perag); @@ -201,6 +203,7 @@ xfs_initialize_perag( pag->pag_agno = index; pag->pag_mount = mp; spin_lock_init(&pag->pag_ici_lock); + INIT_DELAYED_WORK(&pag->pag_blockgc_work, xfs_blockgc_worker); INIT_RADIX_TREE(&pag->pag_ici_root, GFP_ATOMIC);
error = xfs_buf_hash_init(pag); diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index f5ba5645ccd8..9465508bddb2 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -178,8 +178,6 @@ typedef struct xfs_mount { uint64_t m_resblks_avail;/* available reserved blocks */ uint64_t m_resblks_save; /* reserved blks @ remount,ro */ struct delayed_work m_reclaim_work; /* background inode reclaim */ - struct delayed_work m_blockgc_work; /* background prealloc blocks - trimming */ struct xfs_kobj m_kobj; struct xfs_kobj m_error_kobj; struct xfs_kobj m_error_meta_kobj; @@ -368,6 +366,9 @@ typedef struct xfs_perag { /* Blocks reserved for the reverse mapping btree. */ struct xfs_ag_resv pag_rmapbt_resv;
+ /* background prealloc block trimming */ + struct delayed_work pag_blockgc_work; + /* reference count */ uint8_t pagf_refcount_level;
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 8982776522d3..333fe39467f3 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -35,6 +35,7 @@ #include "xfs_refcount_item.h" #include "xfs_bmap_item.h" #include "xfs_reflink.h" +#include "xfs_pwork.h"
#include <linux/magic.h> #include <linux/fs_context.h> @@ -539,7 +540,7 @@ xfs_init_mount_workqueues( goto out_destroy_cil;
mp->m_blockgc_workqueue = alloc_workqueue("xfs-blockgc/%s", - XFS_WQFLAGS(WQ_FREEZABLE | WQ_MEM_RECLAIM), + XFS_WQFLAGS(WQ_UNBOUND | WQ_FREEZABLE | WQ_MEM_RECLAIM), 0, mp->m_super->s_id); if (!mp->m_blockgc_workqueue) goto out_destroy_reclaim; @@ -1866,7 +1867,6 @@ static int xfs_init_fs_context( mutex_init(&mp->m_growlock); INIT_WORK(&mp->m_flush_inodes_work, xfs_flush_inodes_worker); INIT_DELAYED_WORK(&mp->m_reclaim_work, xfs_reclaim_worker); - INIT_DELAYED_WORK(&mp->m_blockgc_work, xfs_blockgc_worker); mp->m_kobj.kobject.kset = xfs_kset; /* * We don't create the finobt per-ag space reservation until after log
From: "Darrick J. Wong" djwong@kernel.org
mainline-inclusion from mainline-v5.12-rc4 commit 383e32b0d0db464dc53052a97bf7f9ee3a1937cc category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4KIAO CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
-------------------------------------------------
Files containing metadata (quota records, rt bitmap and summary info) are fully managed by the filesystem, which means that all resource cleanup must be explicit, not automatic. This means that they should never be subjected to automatic post-EOF truncation, nor should they be freed automatically even if the link count drops to zero.
In other words, xfs_inactive() should leave these files alone. Add the necessary predicate functions to make this happen. This adds a second layer of prevention for the kinds of fs corruption that were fixed by commit f4c32e87de7d. If we ever decide to support removing metadata files, we should make all those metadata updates explicit.
Rearrange the order of #includes to fix compiler errors, since xfs_mount.h is supposed to be included before xfs_inode.h.
Followup-to: f4c32e87de7d ("xfs: fix realtime bitmap/summary file truncation when growing rt volume") Signed-off-by: Darrick J. Wong djwong@kernel.org Reviewed-by: Christoph Hellwig hch@lst.de Signed-off-by: Lihong Kou koulihong@huawei.com Reviewed-by: Zhang Yi yi.zhang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- fs/xfs/libxfs/xfs_iext_tree.c | 2 +- fs/xfs/xfs_inode.c | 4 ++++ fs/xfs/xfs_inode.h | 8 ++++++++ fs/xfs/xfs_xattr.c | 2 ++ 4 files changed, 15 insertions(+), 1 deletion(-)
diff --git a/fs/xfs/libxfs/xfs_iext_tree.c b/fs/xfs/libxfs/xfs_iext_tree.c index b4164256993d..773cf4349428 100644 --- a/fs/xfs/libxfs/xfs_iext_tree.c +++ b/fs/xfs/libxfs/xfs_iext_tree.c @@ -8,9 +8,9 @@ #include "xfs_format.h" #include "xfs_bit.h" #include "xfs_log_format.h" -#include "xfs_inode.h" #include "xfs_trans_resv.h" #include "xfs_mount.h" +#include "xfs_inode.h" #include "xfs_trace.h"
/* diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 5983eee137bd..11114f48032e 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -1813,6 +1813,10 @@ xfs_inactive( if (mp->m_flags & XFS_MOUNT_RDONLY) return;
+ /* Metadata inodes require explicit resource cleanup. */ + if (xfs_is_metadata_inode(ip)) + return; + /* Try to clean out the cow blocks if there are any. */ if (xfs_inode_has_cow_data(ip)) xfs_reflink_cancel_cow_range(ip, 0, NULLFILEOFF, true); diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index 751a3d1d7d84..51cb2ba5462f 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -185,6 +185,14 @@ static inline bool xfs_is_reflink_inode(struct xfs_inode *ip) return ip->i_d.di_flags2 & XFS_DIFLAG2_REFLINK; }
+static inline bool xfs_is_metadata_inode(struct xfs_inode *ip) +{ + struct xfs_mount *mp = ip->i_mount; + + return ip == mp->m_rbmip || ip == mp->m_rsumip || + xfs_is_quota_inode(&mp->m_sb, ip->i_ino); +} + /* * Check if an inode has any data in the COW fork. This might be often false * even for inodes with the reflink flag when there is no pending COW operation. diff --git a/fs/xfs/xfs_xattr.c b/fs/xfs/xfs_xattr.c index bca48b308c02..d0348bd63cce 100644 --- a/fs/xfs/xfs_xattr.c +++ b/fs/xfs/xfs_xattr.c @@ -9,6 +9,8 @@ #include "xfs_format.h" #include "xfs_log_format.h" #include "xfs_da_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" #include "xfs_inode.h" #include "xfs_attr.h" #include "xfs_acl.h"
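As a rough illustration of the guard added to xfs_inactive() above, here is a tiny runnable userspace model: a predicate identifies metadata files and skips them before any automatic cleanup runs. The structures and names (fake_mount, is_metadata_inode, inactive) are invented stand-ins, not the XFS types.

#include <stdbool.h>
#include <stdio.h>

struct fake_inode { unsigned long ino; };
struct fake_mount { unsigned long rbm_ino, rsum_ino, uquota_ino; };

static bool is_metadata_inode(const struct fake_mount *mp,
			      const struct fake_inode *ip)
{
	return ip->ino == mp->rbm_ino || ip->ino == mp->rsum_ino ||
	       ip->ino == mp->uquota_ino;
}

static void inactive(const struct fake_mount *mp, const struct fake_inode *ip)
{
	/* Metadata inodes require explicit resource cleanup. */
	if (is_metadata_inode(mp, ip)) {
		printf("ino %lu: metadata, leaving alone\n", ip->ino);
		return;
	}
	printf("ino %lu: running automatic cleanup\n", ip->ino);
}

int main(void)
{
	struct fake_mount mp = { .rbm_ino = 129, .rsum_ino = 130,
				 .uquota_ino = 131 };
	struct fake_inode rt = { 129 }, reg = { 1024 };

	inactive(&mp, &rt);
	inactive(&mp, &reg);
	return 0;
}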
From: "Darrick J. Wong" djwong@kernel.org
mainline-inclusion from mainline-v5.12-rc4 commit 3fef46fc43ca12a0006d6683c8ac114628ad53a1 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4KIAO CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
-------------------------------------------------
Since we're about to start using the blockgc workqueue to dispose of inactivated inodes, strip the "block" prefix from the name; now it's merely the general garbage collection (gc) workqueue.
Signed-off-by: Darrick J. Wong djwong@kernel.org Reviewed-by: Christoph Hellwig hch@lst.de Reviewed-by: Zhang Yi yi.zhang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- Documentation/admin-guide/xfs.rst | 3 +++ fs/xfs/xfs_icache.c | 2 +- fs/xfs/xfs_mount.h | 2 +- fs/xfs/xfs_super.c | 10 +++++----- 4 files changed, 10 insertions(+), 7 deletions(-)
diff --git a/Documentation/admin-guide/xfs.rst b/Documentation/admin-guide/xfs.rst index b00b1eece9de..a15142fdc86f 100644 --- a/Documentation/admin-guide/xfs.rst +++ b/Documentation/admin-guide/xfs.rst @@ -518,6 +518,9 @@ and the short name of the data device. They all can be found in: ================ =========== xfs_iwalk-$pid Inode scans of the entire filesystem. Currently limited to mount time quotacheck. + xfs-gc Background garbage collection of disk space that has been + speculatively allocated beyond EOF or for staging copy on + write operations. ================ ===========
For example, the knobs for the quotacheck workqueue for /dev/nvme0n1 would be diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index e3a94178e6a8..3d1e379ccf51 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -1337,7 +1337,7 @@ xfs_blockgc_queue( { rcu_read_lock(); if (radix_tree_tagged(&pag->pag_ici_root, XFS_ICI_BLOCKGC_TAG)) - queue_delayed_work(pag->pag_mount->m_blockgc_workqueue, + queue_delayed_work(pag->pag_mount->m_gc_workqueue, &pag->pag_blockgc_work, msecs_to_jiffies(xfs_blockgc_secs * 1000)); rcu_read_unlock(); diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index 9465508bddb2..3bae88b45fca 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -94,7 +94,7 @@ typedef struct xfs_mount { struct workqueue_struct *m_unwritten_workqueue; struct workqueue_struct *m_cil_workqueue; struct workqueue_struct *m_reclaim_workqueue; - struct workqueue_struct *m_blockgc_workqueue; + struct workqueue_struct *m_gc_workqueue; struct workqueue_struct *m_sync_workqueue;
int m_bsize; /* fs logical block size */ diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 333fe39467f3..8953d4bfb61a 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -539,10 +539,10 @@ xfs_init_mount_workqueues( if (!mp->m_reclaim_workqueue) goto out_destroy_cil;
- mp->m_blockgc_workqueue = alloc_workqueue("xfs-blockgc/%s", - XFS_WQFLAGS(WQ_UNBOUND | WQ_FREEZABLE | WQ_MEM_RECLAIM), + mp->m_gc_workqueue = alloc_workqueue("xfs-gc/%s", + WQ_SYSFS | WQ_UNBOUND | WQ_FREEZABLE | WQ_MEM_RECLAIM, 0, mp->m_super->s_id); - if (!mp->m_blockgc_workqueue) + if (!mp->m_gc_workqueue) goto out_destroy_reclaim;
mp->m_sync_workqueue = alloc_workqueue("xfs-sync/%s", @@ -553,7 +553,7 @@ xfs_init_mount_workqueues( return 0;
out_destroy_eofb: - destroy_workqueue(mp->m_blockgc_workqueue); + destroy_workqueue(mp->m_gc_workqueue); out_destroy_reclaim: destroy_workqueue(mp->m_reclaim_workqueue); out_destroy_cil: @@ -571,7 +571,7 @@ xfs_destroy_mount_workqueues( struct xfs_mount *mp) { destroy_workqueue(mp->m_sync_workqueue); - destroy_workqueue(mp->m_blockgc_workqueue); + destroy_workqueue(mp->m_gc_workqueue); destroy_workqueue(mp->m_reclaim_workqueue); destroy_workqueue(mp->m_cil_workqueue); destroy_workqueue(mp->m_unwritten_workqueue);
From: "Darrick J. Wong" djwong@kernel.org
mainline-inclusion from mainline-v5.11-rc4 commit 47bd6d3457fb96d287278027aed8a78d14f1d32d category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4KIAO CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
-------------------------------------------------
Expose the workqueue sysfs knobs for the speculative preallocation gc workers on all kernels, and update the sysadmin information.
Signed-off-by: Darrick J. Wong djwong@kernel.org Reviewed-by: Christoph Hellwig hch@lst.de Signed-off-by: Lihong Kou koulihong@huawei.com Reviewed-by: Zhang Yi yi.zhang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- Documentation/admin-guide/xfs.rst | 2 +- fs/xfs/xfs_super.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-)
diff --git a/Documentation/admin-guide/xfs.rst b/Documentation/admin-guide/xfs.rst index a15142fdc86f..4b9096a8ee5e 100644 --- a/Documentation/admin-guide/xfs.rst +++ b/Documentation/admin-guide/xfs.rst @@ -518,7 +518,7 @@ and the short name of the data device. They all can be found in: ================ =========== xfs_iwalk-$pid Inode scans of the entire filesystem. Currently limited to mount time quotacheck. - xfs-gc Background garbage collection of disk space that has been + xfs-blockgc Background garbage collection of disk space that has been speculatively allocated beyond EOF or for staging copy on write operations. ================ =========== diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 8953d4bfb61a..46e148f5534d 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -539,7 +539,7 @@ xfs_init_mount_workqueues( if (!mp->m_reclaim_workqueue) goto out_destroy_cil;
- mp->m_gc_workqueue = alloc_workqueue("xfs-gc/%s", + mp->m_gc_workqueue = alloc_workqueue("xfs-blockgc/%s", WQ_SYSFS | WQ_UNBOUND | WQ_FREEZABLE | WQ_MEM_RECLAIM, 0, mp->m_super->s_id); if (!mp->m_gc_workqueue)
From: "Darrick J. Wong" djwong@kernel.org
mainline-inclusion from mainline-v5.11-rc4 commit 0fa4a10a2f5f96a06373ea81f8cd5f97c5cc264f category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4KIAO CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
-------------------------------------------------
Since xfs_inode_free_eofblocks and xfs_inode_free_cowblocks are now internal static functions, we can save ourselves a cycling of the iolock by passing the lock state out to xfs_blockgc_scan_inode and letting it do all the unlocking.
Signed-off-by: Darrick J. Wong djwong@kernel.org Reviewed-by: Christoph Hellwig hch@lst.de Signed-off-by: Lihong Kou koulihong@huawei.com Reviewed-by: Zhang Yi yi.zhang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- fs/xfs/xfs_icache.c | 43 +++++++++++++++++++++---------------------- 1 file changed, 21 insertions(+), 22 deletions(-)
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index 3d1e379ccf51..b3db40835069 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -1283,11 +1283,11 @@ xfs_reclaim_worker( STATIC int xfs_inode_free_eofblocks( struct xfs_inode *ip, - void *args) + void *args, + unsigned int *lockflags) { struct xfs_eofblocks *eofb = args; bool wait; - int ret;
wait = eofb && (eofb->eof_flags & XFS_EOF_FLAGS_SYNC);
@@ -1320,11 +1320,9 @@ xfs_inode_free_eofblocks( return -EAGAIN; return 0; } + *lockflags |= XFS_IOLOCK_EXCL;
- ret = xfs_free_eofblocks(ip); - xfs_iunlock(ip, XFS_IOLOCK_EXCL); - - return ret; + return xfs_free_eofblocks(ip); }
/* @@ -1493,7 +1491,8 @@ xfs_prep_free_cowblocks( STATIC int xfs_inode_free_cowblocks( struct xfs_inode *ip, - void *args) + void *args, + unsigned int *lockflags) { struct xfs_eofblocks *eofb = args; bool wait; @@ -1514,16 +1513,20 @@ xfs_inode_free_cowblocks( * If the caller is waiting, return -EAGAIN to keep the background * scanner moving and revisit the inode in a subsequent pass. */ - if (!xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) { + if (!(*lockflags & XFS_IOLOCK_EXCL) && + !xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) { if (wait) return -EAGAIN; return 0; } + *lockflags |= XFS_IOLOCK_EXCL; + if (!xfs_ilock_nowait(ip, XFS_MMAPLOCK_EXCL)) { if (wait) - ret = -EAGAIN; - goto out_iolock; + return -EAGAIN; + return 0; } + *lockflags |= XFS_MMAPLOCK_EXCL;
/* * Check again, nobody else should be able to dirty blocks or change @@ -1531,11 +1534,6 @@ xfs_inode_free_cowblocks( */ if (xfs_prep_free_cowblocks(ip)) ret = xfs_reflink_cancel_cow_range(ip, 0, NULLFILEOFF, false); - - xfs_iunlock(ip, XFS_MMAPLOCK_EXCL); -out_iolock: - xfs_iunlock(ip, XFS_IOLOCK_EXCL); - return ret; }
@@ -1593,17 +1591,18 @@ xfs_blockgc_scan_inode( struct xfs_inode *ip, void *args) { + unsigned int lockflags = 0; int error;
- error = xfs_inode_free_eofblocks(ip, args); + error = xfs_inode_free_eofblocks(ip, args, &lockflags); if (error) - return error; + goto unlock;
- error = xfs_inode_free_cowblocks(ip, args); - if (error) - return error; - - return 0; + error = xfs_inode_free_cowblocks(ip, args, &lockflags); +unlock: + if (lockflags) + xfs_iunlock(ip, lockflags); + return error; }
/* Background worker that trims preallocated space. */
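The lockflags idea above can be summarized as: each step records the locks it managed to take in a caller-owned mask and never unlocks itself, so a lock taken by the first step is reused by the second and released exactly once by the caller. A minimal userspace model follows; the lock bits, trylock helper, and step functions are all invented for the demo.

#include <stdio.h>

#define LOCK_IO		(1u << 0)
#define LOCK_MMAP	(1u << 1)

static unsigned int held;	/* stands in for the real lock state */

static int trylock(unsigned int flag)
{
	if (held & flag)
		return 0;	/* already held: trylock would fail */
	held |= flag;
	return 1;
}

static int step_one(unsigned int *lockflags)
{
	if (!trylock(LOCK_IO))
		return -1;
	*lockflags |= LOCK_IO;	/* record it; note: no unlock here */
	return 0;
}

static int step_two(unsigned int *lockflags)
{
	/* Reuse the IO lock if step one already took it. */
	if (!(*lockflags & LOCK_IO) && !trylock(LOCK_IO))
		return -1;
	*lockflags |= LOCK_IO;
	if (!trylock(LOCK_MMAP))
		return -1;
	*lockflags |= LOCK_MMAP;
	return 0;
}

int main(void)
{
	unsigned int lockflags = 0;
	int error = step_one(&lockflags);

	if (!error)
		error = step_two(&lockflags);
	if (lockflags)
		held &= ~lockflags;	/* single unlock site, as in the patch */
	printf("error=%d held=%#x\n", error, held);
	return 0;
}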
From: "Darrick J. Wong" djwong@kernel.org
mainline-inclusion from mainline-v5.13-rc4 commit 1ad2cfe0a57031505df682dc1e26922d9d43737f category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4KIAO CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
-------------------------------------------------
The only external caller of xfs_inode_walk* happens in quotaoff, when we want to walk all the incore inodes to detach the dquots. Move this code to xfs_icache.c so that we can hide xfs_inode_walk as the starting step in more cleanups of inode walks.
Signed-off-by: Darrick J. Wong djwong@kernel.org Reviewed-by: Dave Chinner dchinner@redhat.com Signed-off-by: Lihong Kou koulihong@huawei.com Reviewed-by: Zhang Yi yi.zhang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- fs/xfs/xfs_icache.c | 65 +++++++++++++++++++++++++++++++++++++++- fs/xfs/xfs_icache.h | 8 +++-- fs/xfs/xfs_qm.h | 1 - fs/xfs/xfs_qm_syscalls.c | 54 ++------------------------------- 4 files changed, 71 insertions(+), 57 deletions(-)
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index b3db40835069..c0cf8c7c6449 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -26,6 +26,18 @@
#include <linux/iversion.h>
+/* + * Private inode cache walk flags for struct xfs_eofblocks. Must not coincide + * with XFS_EOF_FLAGS_*. + */ +#define XFS_ICWALK_FLAG_DROP_UDQUOT (1U << 31) +#define XFS_ICWALK_FLAG_DROP_GDQUOT (1U << 30) +#define XFS_ICWALK_FLAG_DROP_PDQUOT (1U << 29) + +#define XFS_ICWALK_PRIVATE_FLAGS (XFS_ICWALK_FLAG_DROP_UDQUOT | \ + XFS_ICWALK_FLAG_DROP_GDQUOT | \ + XFS_ICWALK_FLAG_DROP_PDQUOT) + /* * Allocate and initialise an xfs_inode. */ @@ -886,7 +898,7 @@ xfs_inode_walk_get_perag( * Call the @execute function on all incore inodes matching the radix tree * @tag. */ -int +static int xfs_inode_walk( struct xfs_mount *mp, int iter_flags, @@ -911,7 +923,58 @@ xfs_inode_walk( } } return last_error; + BUILD_BUG_ON(XFS_ICWALK_PRIVATE_FLAGS & XFS_EOF_FLAGS_VALID); +} + +#ifdef CONFIG_XFS_QUOTA +/* Drop this inode's dquots. */ +static int +xfs_dqrele_inode( + struct xfs_inode *ip, + void *priv) +{ + struct xfs_eofblocks *eofb = priv; + + xfs_ilock(ip, XFS_ILOCK_EXCL); + if (eofb->eof_flags & XFS_ICWALK_FLAG_DROP_UDQUOT) { + xfs_qm_dqrele(ip->i_udquot); + ip->i_udquot = NULL; + } + if (eofb->eof_flags & XFS_ICWALK_FLAG_DROP_GDQUOT) { + xfs_qm_dqrele(ip->i_gdquot); + ip->i_gdquot = NULL; + } + if (eofb->eof_flags & XFS_ICWALK_FLAG_DROP_PDQUOT) { + xfs_qm_dqrele(ip->i_pdquot); + ip->i_pdquot = NULL; + } + xfs_iunlock(ip, XFS_ILOCK_EXCL); + return 0; +} + +/* + * Detach all dquots from incore inodes if we can. The caller must already + * have dropped the relevant XFS_[UGP]QUOTA_ACTIVE flags so that dquots will + * not get reattached. + */ +int +xfs_dqrele_all_inodes( + struct xfs_mount *mp, + unsigned int qflags) +{ + struct xfs_eofblocks eofb = { .eof_flags = 0 }; + + if (qflags & XFS_UQUOTA_ACCT) + eofb.eof_flags |= XFS_ICWALK_FLAG_DROP_UDQUOT; + if (qflags & XFS_GQUOTA_ACCT) + eofb.eof_flags |= XFS_ICWALK_FLAG_DROP_GDQUOT; + if (qflags & XFS_PQUOTA_ACCT) + eofb.eof_flags |= XFS_ICWALK_FLAG_DROP_PDQUOT; + + return xfs_inode_walk(mp, XFS_INODE_WALK_INEW_WAIT, xfs_dqrele_inode, + &eofb, XFS_ICI_NO_TAG); } +#endif /* CONFIG_XFS_QUOTA */
/* * Grab the inode for reclaim exclusively. diff --git a/fs/xfs/xfs_icache.h b/fs/xfs/xfs_icache.h index d1fddb152420..d9baa6df1121 100644 --- a/fs/xfs/xfs_icache.h +++ b/fs/xfs/xfs_icache.h @@ -68,9 +68,11 @@ void xfs_inode_clear_cowblocks_tag(struct xfs_inode *ip);
void xfs_blockgc_worker(struct work_struct *work);
-int xfs_inode_walk(struct xfs_mount *mp, int iter_flags, - int (*execute)(struct xfs_inode *ip, void *args), - void *args, int tag); +#ifdef CONFIG_XFS_QUOTA +int xfs_dqrele_all_inodes(struct xfs_mount *mp, unsigned int qflags); +#else +# define xfs_dqrele_all_inodes(mp, qflags) (0) +#endif
int xfs_icache_inode_is_allocated(struct xfs_mount *mp, struct xfs_trans *tp, xfs_ino_t ino, bool *inuse); diff --git a/fs/xfs/xfs_qm.h b/fs/xfs/xfs_qm.h index e3dabab44097..ebbb484c49dc 100644 --- a/fs/xfs/xfs_qm.h +++ b/fs/xfs/xfs_qm.h @@ -142,7 +142,6 @@ extern void xfs_qm_destroy_quotainfo(struct xfs_mount *);
/* dquot stuff */ extern void xfs_qm_dqpurge_all(struct xfs_mount *, uint); -extern void xfs_qm_dqrele_all_inodes(struct xfs_mount *, uint);
/* quota ops */ extern int xfs_qm_scall_trunc_qfiles(struct xfs_mount *, uint); diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c index ca1b57d291dc..24a04b8614a7 100644 --- a/fs/xfs/xfs_qm_syscalls.c +++ b/fs/xfs/xfs_qm_syscalls.c @@ -201,7 +201,8 @@ xfs_qm_scall_quotaoff( * depend on the quota inodes (and other things) being valid as long as * we keep the lock(s). */ - xfs_qm_dqrele_all_inodes(mp, flags); + error = xfs_dqrele_all_inodes(mp, flags); + ASSERT(!error);
/* * Next we make the changes in the quota flag in the mount struct. @@ -747,54 +748,3 @@ xfs_qm_scall_getquota_next( xfs_qm_dqput(dqp); return error; } - -STATIC int -xfs_dqrele_inode( - struct xfs_inode *ip, - void *args) -{ - uint *flags = args; - - /* skip quota inodes */ - if (ip == ip->i_mount->m_quotainfo->qi_uquotaip || - ip == ip->i_mount->m_quotainfo->qi_gquotaip || - ip == ip->i_mount->m_quotainfo->qi_pquotaip) { - ASSERT(ip->i_udquot == NULL); - ASSERT(ip->i_gdquot == NULL); - ASSERT(ip->i_pdquot == NULL); - return 0; - } - - xfs_ilock(ip, XFS_ILOCK_EXCL); - if ((*flags & XFS_UQUOTA_ACCT) && ip->i_udquot) { - xfs_qm_dqrele(ip->i_udquot); - ip->i_udquot = NULL; - } - if ((*flags & XFS_GQUOTA_ACCT) && ip->i_gdquot) { - xfs_qm_dqrele(ip->i_gdquot); - ip->i_gdquot = NULL; - } - if ((*flags & XFS_PQUOTA_ACCT) && ip->i_pdquot) { - xfs_qm_dqrele(ip->i_pdquot); - ip->i_pdquot = NULL; - } - xfs_iunlock(ip, XFS_ILOCK_EXCL); - return 0; -} - - -/* - * Go thru all the inodes in the file system, releasing their dquots. - * - * Note that the mount structure gets modified to indicate that quotas are off - * AFTER this, in the case of quotaoff. - */ -void -xfs_qm_dqrele_all_inodes( - struct xfs_mount *mp, - uint flags) -{ - ASSERT(mp->m_quotainfo); - xfs_inode_walk(mp, XFS_INODE_WALK_INEW_WAIT, xfs_dqrele_inode, - &flags, XFS_ICI_NO_TAG); -}
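The private-flag scheme above packs the walk-internal bits into the top of the same field that carries the public XFS_EOF_FLAGS_* values, with a compile-time check proving the two namespaces never collide. This is directly expressible in standard C; the sketch below mirrors the pattern with _Static_assert in place of BUILD_BUG_ON, and the public mask value 0x1F is invented for the demo, not the real XFS_EOF_FLAGS_VALID.

#include <stdio.h>

#define PUBLIC_FLAGS_VALID	0x1Fu	/* stand-in for XFS_EOF_FLAGS_VALID */

#define WALK_FLAG_DROP_UDQUOT	(1u << 31)
#define WALK_FLAG_DROP_GDQUOT	(1u << 30)
#define WALK_FLAG_DROP_PDQUOT	(1u << 29)

#define WALK_PRIVATE_FLAGS	(WALK_FLAG_DROP_UDQUOT | \
				 WALK_FLAG_DROP_GDQUOT | \
				 WALK_FLAG_DROP_PDQUOT)

/* Same idea as the BUILD_BUG_ON in the patch: fail the build on overlap. */
_Static_assert((WALK_PRIVATE_FLAGS & PUBLIC_FLAGS_VALID) == 0,
	       "private walk flags overlap the public flag namespace");

int main(void)
{
	unsigned int flags = WALK_FLAG_DROP_UDQUOT | WALK_FLAG_DROP_PDQUOT;

	printf("drop udquot: %d\n", !!(flags & WALK_FLAG_DROP_UDQUOT));
	printf("drop gdquot: %d\n", !!(flags & WALK_FLAG_DROP_GDQUOT));
	return 0;
}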
From: "Darrick J. Wong" djwong@kernel.org
mainline-inclusion from mainline-v5.13-rc4 commit 3ea06d73e3c02ee2952a62bf92abc18f9c98aba1 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4KIAO CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
-------------------------------------------------
Once we're done with inactivating an inode, we're finished updating metadata for that inode. This means that we can detach the dquots at the end and not have to wait for reclaim to do it for us.
Signed-off-by: Darrick J. Wong djwong@kernel.org Reviewed-by: Dave Chinner dchinner@redhat.com Signed-off-by: Lihong Kou koulihong@huawei.com Reviewed-by: Zhang Yi yi.zhang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- fs/xfs/xfs_icache.c | 2 +- fs/xfs/xfs_inode.c | 22 +++++++++++----------- 2 files changed, 12 insertions(+), 12 deletions(-)
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index c0cf8c7c6449..958102d6441d 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -1091,7 +1091,7 @@ xfs_reclaim_inode( * unlocked after the lookup before we go ahead and free it. */ xfs_ilock(ip, XFS_ILOCK_EXCL); - xfs_qm_dqdetach(ip); + ASSERT(!ip->i_udquot && !ip->i_gdquot && !ip->i_pdquot); xfs_iunlock(ip, XFS_ILOCK_EXCL); ASSERT(xfs_inode_clean(ip));
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 11114f48032e..4b7228e2e39e 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -1803,7 +1803,7 @@ xfs_inactive( */ if (VFS_I(ip)->i_mode == 0) { ASSERT(ip->i_df.if_broot_bytes == 0); - return; + goto out; }
mp = ip->i_mount; @@ -1811,11 +1811,11 @@ xfs_inactive(
/* If this is a read-only mount, don't do this (would generate I/O) */ if (mp->m_flags & XFS_MOUNT_RDONLY) - return; + goto out;
/* Metadata inodes require explicit resource cleanup. */ if (xfs_is_metadata_inode(ip)) - return; + goto out;
/* Try to clean out the cow blocks if there are any. */ if (xfs_inode_has_cow_data(ip)) @@ -1834,7 +1834,7 @@ xfs_inactive( if (xfs_can_free_eofblocks(ip, true)) xfs_free_eofblocks(ip);
- return; + goto out; }
if (S_ISREG(VFS_I(ip)->i_mode) && @@ -1844,14 +1844,14 @@ xfs_inactive(
error = xfs_qm_dqattach(ip); if (error) - return; + goto out;
if (S_ISLNK(VFS_I(ip)->i_mode)) error = xfs_inactive_symlink(ip); else if (truncate) error = xfs_inactive_truncate(ip); if (error) - return; + goto out;
/* * If there are attributes associated with the file then blow them away @@ -1861,7 +1861,7 @@ xfs_inactive( if (XFS_IFORK_Q(ip)) { error = xfs_attr_inactive(ip); if (error) - return; + goto out; }
ASSERT(!ip->i_afp); @@ -1870,12 +1870,12 @@ xfs_inactive( /* * Free the inode. */ - error = xfs_inactive_ifree(ip); - if (error) - return; + xfs_inactive_ifree(ip);
+out: /* - * Release the dquots held by inode, if any. + * We're done making metadata updates for this inode, so we can release + * the attached dquots. */ xfs_qm_dqdetach(ip); }
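The restructuring above is the classic single-exit cleanup idiom: every early return becomes a goto to one label, so the dquot release runs on every path out of the function. A minimal runnable model of just that shape follows; the types and names (fake_inode, dqdetach, inactive) are invented for the demo.

#include <stdio.h>

struct fake_inode { int dquot_attached; int mode; };

static void dqdetach(struct fake_inode *ip)
{
	if (ip->dquot_attached) {
		ip->dquot_attached = 0;
		puts("dquots detached");
	}
}

static void inactive(struct fake_inode *ip)
{
	if (ip->mode == 0)	/* nothing to do for this inode */
		goto out;

	puts("freeing inode metadata");
out:
	/* Done making metadata updates; always drop the dquots here. */
	dqdetach(ip);
}

int main(void)
{
	struct fake_inode a = { .dquot_attached = 1, .mode = 0 };
	struct fake_inode b = { .dquot_attached = 1, .mode = 0100644 };

	inactive(&a);	/* early-out path still detaches */
	inactive(&b);	/* normal path detaches too */
	return 0;
}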
From: "Darrick J. Wong" djwong@kernel.org
mainline-inclusion from mainline-v5.13-rc4 commit df60019739d8850b865d313053d30aa93dc38a65 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4KIAO CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
-------------------------------------------------
Move the inode walk functions further down in the file to limit the forward declarations to the two walk functions as we add new code that uses the inode walks. We'll clean them out later (i.e. after the deferred inode inactivation series).
Signed-off-by: Darrick J. Wong djwong@kernel.org Reviewed-by: Dave Chinner dchinner@redhat.com Signed-off-by: Lihong Kou koulihong@huawei.com Reviewed-by: Zhang Yi yi.zhang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- fs/xfs/xfs_icache.c | 401 +++++++++++++++++++++++--------------------- 1 file changed, 206 insertions(+), 195 deletions(-)
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index 958102d6441d..17351c242201 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -26,6 +26,13 @@
#include <linux/iversion.h>
+static int xfs_inode_walk(struct xfs_mount *mp, int iter_flags, + int (*execute)(struct xfs_inode *ip, void *args), + void *args, int tag); +static int xfs_inode_walk_ag(struct xfs_perag *pag, int iter_flags, + int (*execute)(struct xfs_inode *ip, void *args), + void *args, int tag); + /* * Private inode cache walk flags for struct xfs_eofblocks. Must not coincide * with XFS_EOF_FLAGS_*. @@ -728,204 +735,12 @@ xfs_icache_inode_is_allocated( * radix tree lookups to a minimum. The batch size is a trade off between * lookup reduction and stack usage. This is in the reclaim path, so we can't * be too greedy. + * + * XXX: This will be moved closer to xfs_inode_walk* once we get rid of the + * separate reclaim walk functions. */ #define XFS_LOOKUP_BATCH 32
-/* - * Decide if the given @ip is eligible to be a part of the inode walk, and - * grab it if so. Returns true if it's ready to go or false if we should just - * ignore it. - */ -STATIC bool -xfs_inode_walk_ag_grab( - struct xfs_inode *ip, - int flags) -{ - struct inode *inode = VFS_I(ip); - bool newinos = !!(flags & XFS_INODE_WALK_INEW_WAIT); - - ASSERT(rcu_read_lock_held()); - - /* Check for stale RCU freed inode */ - spin_lock(&ip->i_flags_lock); - if (!ip->i_ino) - goto out_unlock_noent; - - /* avoid new or reclaimable inodes. Leave for reclaim code to flush */ - if ((!newinos && __xfs_iflags_test(ip, XFS_INEW)) || - __xfs_iflags_test(ip, XFS_IRECLAIMABLE | XFS_IRECLAIM)) - goto out_unlock_noent; - spin_unlock(&ip->i_flags_lock); - - /* nothing to sync during shutdown */ - if (XFS_FORCED_SHUTDOWN(ip->i_mount)) - return false; - - /* If we can't grab the inode, it must on it's way to reclaim. */ - if (!igrab(inode)) - return false; - - /* inode is valid */ - return true; - -out_unlock_noent: - spin_unlock(&ip->i_flags_lock); - return false; -} - -/* - * For a given per-AG structure @pag, grab, @execute, and rele all incore - * inodes with the given radix tree @tag. - */ -STATIC int -xfs_inode_walk_ag( - struct xfs_perag *pag, - int iter_flags, - int (*execute)(struct xfs_inode *ip, void *args), - void *args, - int tag) -{ - struct xfs_mount *mp = pag->pag_mount; - uint32_t first_index; - int last_error = 0; - int skipped; - bool done; - int nr_found; - -restart: - done = false; - skipped = 0; - first_index = 0; - nr_found = 0; - do { - struct xfs_inode *batch[XFS_LOOKUP_BATCH]; - int error = 0; - int i; - - rcu_read_lock(); - - if (tag == XFS_ICI_NO_TAG) - nr_found = radix_tree_gang_lookup(&pag->pag_ici_root, - (void **)batch, first_index, - XFS_LOOKUP_BATCH); - else - nr_found = radix_tree_gang_lookup_tag( - &pag->pag_ici_root, - (void **) batch, first_index, - XFS_LOOKUP_BATCH, tag); - - if (!nr_found) { - rcu_read_unlock(); - break; - } - - /* - * Grab the inodes before we drop the lock. if we found - * nothing, nr == 0 and the loop will be skipped. - */ - for (i = 0; i < nr_found; i++) { - struct xfs_inode *ip = batch[i]; - - if (done || !xfs_inode_walk_ag_grab(ip, iter_flags)) - batch[i] = NULL; - - /* - * Update the index for the next lookup. Catch - * overflows into the next AG range which can occur if - * we have inodes in the last block of the AG and we - * are currently pointing to the last inode. - * - * Because we may see inodes that are from the wrong AG - * due to RCU freeing and reallocation, only update the - * index if it lies in this AG. It was a race that lead - * us to see this inode, so another lookup from the - * same index will not find it again. - */ - if (XFS_INO_TO_AGNO(mp, ip->i_ino) != pag->pag_agno) - continue; - first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1); - if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino)) - done = true; - } - - /* unlock now we've grabbed the inodes. */ - rcu_read_unlock(); - - for (i = 0; i < nr_found; i++) { - if (!batch[i]) - continue; - if ((iter_flags & XFS_INODE_WALK_INEW_WAIT) && - xfs_iflags_test(batch[i], XFS_INEW)) - xfs_inew_wait(batch[i]); - error = execute(batch[i], args); - xfs_irele(batch[i]); - if (error == -EAGAIN) { - skipped++; - continue; - } - if (error && last_error != -EFSCORRUPTED) - last_error = error; - } - - /* bail out if the filesystem is corrupted. 
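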
*/ - if (error == -EFSCORRUPTED) - break; - - cond_resched(); - - } while (nr_found && !done); - - if (skipped) { - delay(1); - goto restart; - } - return last_error; -} - -/* Fetch the next (possibly tagged) per-AG structure. */ -static inline struct xfs_perag * -xfs_inode_walk_get_perag( - struct xfs_mount *mp, - xfs_agnumber_t agno, - int tag) -{ - if (tag == XFS_ICI_NO_TAG) - return xfs_perag_get(mp, agno); - return xfs_perag_get_tag(mp, agno, tag); -} - -/* - * Call the @execute function on all incore inodes matching the radix tree - * @tag. - */ -static int -xfs_inode_walk( - struct xfs_mount *mp, - int iter_flags, - int (*execute)(struct xfs_inode *ip, void *args), - void *args, - int tag) -{ - struct xfs_perag *pag; - int error = 0; - int last_error = 0; - xfs_agnumber_t ag; - - ag = 0; - while ((pag = xfs_inode_walk_get_perag(mp, ag, tag))) { - ag = pag->pag_agno + 1; - error = xfs_inode_walk_ag(pag, iter_flags, execute, args, tag); - xfs_perag_put(pag); - if (error) { - last_error = error; - if (error == -EFSCORRUPTED) - break; - } - } - return last_error; - BUILD_BUG_ON(XFS_ICWALK_PRIVATE_FLAGS & XFS_EOF_FLAGS_VALID); -} - #ifdef CONFIG_XFS_QUOTA /* Drop this inode's dquots. */ static int @@ -1648,6 +1463,48 @@ xfs_blockgc_start( xfs_blockgc_queue(pag); }
+/* + * Decide if the given @ip is eligible to be a part of the inode walk, and + * grab it if so. Returns true if it's ready to go or false if we should just + * ignore it. + */ +static bool +xfs_inode_walk_ag_grab( + struct xfs_inode *ip, + int flags) +{ + struct inode *inode = VFS_I(ip); + bool newinos = !!(flags & XFS_INODE_WALK_INEW_WAIT); + + ASSERT(rcu_read_lock_held()); + + /* Check for stale RCU freed inode */ + spin_lock(&ip->i_flags_lock); + if (!ip->i_ino) + goto out_unlock_noent; + + /* avoid new or reclaimable inodes. Leave for reclaim code to flush */ + if ((!newinos && __xfs_iflags_test(ip, XFS_INEW)) || + __xfs_iflags_test(ip, XFS_IRECLAIMABLE | XFS_IRECLAIM)) + goto out_unlock_noent; + spin_unlock(&ip->i_flags_lock); + + /* nothing to sync during shutdown */ + if (XFS_FORCED_SHUTDOWN(ip->i_mount)) + return false; + + /* If we can't grab the inode, it must on it's way to reclaim. */ + if (!igrab(inode)) + return false; + + /* inode is valid */ + return true; + +out_unlock_noent: + spin_unlock(&ip->i_flags_lock); + return false; +} + /* Scan one incore inode for block preallocations that we can remove. */ static int xfs_blockgc_scan_inode( @@ -1768,3 +1625,157 @@ xfs_blockgc_free_quota( xfs_inode_dquot(ip, XFS_DQTYPE_GROUP), xfs_inode_dquot(ip, XFS_DQTYPE_PROJ), eof_flags); } + +/* XFS Inode Cache Walking Code */ + +/* + * For a given per-AG structure @pag, grab, @execute, and rele all incore + * inodes with the given radix tree @tag. + */ +static int +xfs_inode_walk_ag( + struct xfs_perag *pag, + int iter_flags, + int (*execute)(struct xfs_inode *ip, void *args), + void *args, + int tag) +{ + struct xfs_mount *mp = pag->pag_mount; + uint32_t first_index; + int last_error = 0; + int skipped; + bool done; + int nr_found; + +restart: + done = false; + skipped = 0; + first_index = 0; + nr_found = 0; + do { + struct xfs_inode *batch[XFS_LOOKUP_BATCH]; + int error = 0; + int i; + + rcu_read_lock(); + + if (tag == XFS_ICI_NO_TAG) + nr_found = radix_tree_gang_lookup(&pag->pag_ici_root, + (void **)batch, first_index, + XFS_LOOKUP_BATCH); + else + nr_found = radix_tree_gang_lookup_tag( + &pag->pag_ici_root, + (void **) batch, first_index, + XFS_LOOKUP_BATCH, tag); + + if (!nr_found) { + rcu_read_unlock(); + break; + } + + /* + * Grab the inodes before we drop the lock. if we found + * nothing, nr == 0 and the loop will be skipped. + */ + for (i = 0; i < nr_found; i++) { + struct xfs_inode *ip = batch[i]; + + if (done || !xfs_inode_walk_ag_grab(ip, iter_flags)) + batch[i] = NULL; + + /* + * Update the index for the next lookup. Catch + * overflows into the next AG range which can occur if + * we have inodes in the last block of the AG and we + * are currently pointing to the last inode. + * + * Because we may see inodes that are from the wrong AG + * due to RCU freeing and reallocation, only update the + * index if it lies in this AG. It was a race that lead + * us to see this inode, so another lookup from the + * same index will not find it again. + */ + if (XFS_INO_TO_AGNO(mp, ip->i_ino) != pag->pag_agno) + continue; + first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1); + if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino)) + done = true; + } + + /* unlock now we've grabbed the inodes. 
*/ + rcu_read_unlock(); + + for (i = 0; i < nr_found; i++) { + if (!batch[i]) + continue; + if ((iter_flags & XFS_INODE_WALK_INEW_WAIT) && + xfs_iflags_test(batch[i], XFS_INEW)) + xfs_inew_wait(batch[i]); + error = execute(batch[i], args); + xfs_irele(batch[i]); + if (error == -EAGAIN) { + skipped++; + continue; + } + if (error && last_error != -EFSCORRUPTED) + last_error = error; + } + + /* bail out if the filesystem is corrupted. */ + if (error == -EFSCORRUPTED) + break; + + cond_resched(); + + } while (nr_found && !done); + + if (skipped) { + delay(1); + goto restart; + } + return last_error; +} + +/* Fetch the next (possibly tagged) per-AG structure. */ +static inline struct xfs_perag * +xfs_inode_walk_get_perag( + struct xfs_mount *mp, + xfs_agnumber_t agno, + int tag) +{ + if (tag == XFS_ICI_NO_TAG) + return xfs_perag_get(mp, agno); + return xfs_perag_get_tag(mp, agno, tag); +} + +/* + * Call the @execute function on all incore inodes matching the radix tree + * @tag. + */ +static int +xfs_inode_walk( + struct xfs_mount *mp, + int iter_flags, + int (*execute)(struct xfs_inode *ip, void *args), + void *args, + int tag) +{ + struct xfs_perag *pag; + int error = 0; + int last_error = 0; + xfs_agnumber_t agno = 0; + + while ((pag = xfs_inode_walk_get_perag(mp, agno, tag))) { + agno = pag->pag_agno + 1; + error = xfs_inode_walk_ag(pag, iter_flags, execute, args, tag); + xfs_perag_put(pag); + if (error) { + last_error = error; + if (error == -EFSCORRUPTED) + break; + } + } + return last_error; + BUILD_BUG_ON(XFS_ICWALK_PRIVATE_FLAGS & XFS_EOF_FLAGS_VALID); +}
From: "Darrick J. Wong" djwong@kernel.org
mainline-inclusion from mainline-v5.13-rc4 commit c1115c0cba2b82e71ec77e794c684ac87160fcf6 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4KIAO CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
-------------------------------------------------
Shorten the prefix so that all the incore inode cache walk code has "xfs_icwalk" in the name somewhere.
Signed-off-by: Darrick J. Wong djwong@kernel.org Reviewed-by: Dave Chinner dchinner@redhat.com Signed-off-by: Lihong Kou koulihong@huawei.com Reviewed-by: Zhang Yi yi.zhang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- fs/xfs/xfs_icache.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-)
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index 17351c242201..4fa029844eae 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -26,10 +26,10 @@
#include <linux/iversion.h>
-static int xfs_inode_walk(struct xfs_mount *mp, int iter_flags, +static int xfs_icwalk(struct xfs_mount *mp, int iter_flags, int (*execute)(struct xfs_inode *ip, void *args), void *args, int tag); -static int xfs_inode_walk_ag(struct xfs_perag *pag, int iter_flags, +static int xfs_icwalk_ag(struct xfs_perag *pag, int iter_flags, int (*execute)(struct xfs_inode *ip, void *args), void *args, int tag);
@@ -736,7 +736,7 @@ xfs_icache_inode_is_allocated( * lookup reduction and stack usage. This is in the reclaim path, so we can't * be too greedy. * - * XXX: This will be moved closer to xfs_inode_walk* once we get rid of the + * XXX: This will be moved closer to xfs_icwalk* once we get rid of the * separate reclaim walk functions. */ #define XFS_LOOKUP_BATCH 32 @@ -786,7 +786,7 @@ xfs_dqrele_all_inodes( if (qflags & XFS_PQUOTA_ACCT) eofb.eof_flags |= XFS_ICWALK_FLAG_DROP_PDQUOT;
- return xfs_inode_walk(mp, XFS_INODE_WALK_INEW_WAIT, xfs_dqrele_inode, + return xfs_icwalk(mp, XFS_INODE_WALK_INEW_WAIT, xfs_dqrele_inode, &eofb, XFS_ICI_NO_TAG); } #endif /* CONFIG_XFS_QUOTA */ @@ -1537,7 +1537,7 @@ xfs_blockgc_worker(
if (!sb_start_write_trylock(mp->m_super)) return; - error = xfs_inode_walk_ag(pag, 0, xfs_blockgc_scan_inode, NULL, + error = xfs_icwalk_ag(pag, 0, xfs_blockgc_scan_inode, NULL, XFS_ICI_BLOCKGC_TAG); if (error) xfs_info(mp, "AG %u preallocation gc worker failed, err=%d", @@ -1556,7 +1556,7 @@ xfs_blockgc_free_space( { trace_xfs_blockgc_free_space(mp, eofb, _RET_IP_);
- return xfs_inode_walk(mp, 0, xfs_blockgc_scan_inode, eofb, + return xfs_icwalk(mp, 0, xfs_blockgc_scan_inode, eofb, XFS_ICI_BLOCKGC_TAG); }
@@ -1633,7 +1633,7 @@ xfs_blockgc_free_quota( * inodes with the given radix tree @tag. */ static int -xfs_inode_walk_ag( +xfs_icwalk_ag( struct xfs_perag *pag, int iter_flags, int (*execute)(struct xfs_inode *ip, void *args), @@ -1739,7 +1739,7 @@ xfs_inode_walk_ag(
/* Fetch the next (possibly tagged) per-AG structure. */ static inline struct xfs_perag * -xfs_inode_walk_get_perag( +xfs_icwalk_get_perag( struct xfs_mount *mp, xfs_agnumber_t agno, int tag) @@ -1754,7 +1754,7 @@ xfs_inode_walk_get_perag( * @tag. */ static int -xfs_inode_walk( +xfs_icwalk( struct xfs_mount *mp, int iter_flags, int (*execute)(struct xfs_inode *ip, void *args), @@ -1766,9 +1766,9 @@ xfs_inode_walk( int last_error = 0; xfs_agnumber_t agno = 0;
- while ((pag = xfs_inode_walk_get_perag(mp, agno, tag))) { + while ((pag = xfs_icwalk_get_perag(mp, agno, tag))) { agno = pag->pag_agno + 1; - error = xfs_inode_walk_ag(pag, iter_flags, execute, args, tag); + error = xfs_icwalk_ag(pag, iter_flags, execute, args, tag); xfs_perag_put(pag); if (error) { last_error = error;
From: "Darrick J. Wong" djwong@kernel.org
mainline-inclusion from mainline-v5.13-rc4 commit c809d7e948a131cba8fdf9fbd0b50e1f59255f50 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4KIAO CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
-------------------------------------------------
As part of removing the indirect calls and radix tag implementation details from the incore inode walk loop, create an enum to represent the goal of the inode iteration. More immediately, this separation removes the need for the "ICI_NOTAG" define, which makes little sense.
Signed-off-by: Darrick J. Wong djwong@kernel.org Reviewed-by: Dave Chinner dchinner@redhat.com Signed-off-by: Lihong Kou koulihong@huawei.com Reviewed-by: Zhang Yi yi.zhang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- fs/xfs/xfs_icache.c | 55 +++++++++++++++++++++++++++++++++++---------- fs/xfs/xfs_icache.h | 9 -------- 2 files changed, 43 insertions(+), 21 deletions(-)
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index 4fa029844eae..1e9491208982 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -26,12 +26,40 @@
#include <linux/iversion.h>
+/* Radix tree tags for incore inode tree. */ + +/* inode is to be reclaimed */ +#define XFS_ICI_RECLAIM_TAG 0 +/* Inode has speculative preallocations (posteof or cow) to clean. */ +#define XFS_ICI_BLOCKGC_TAG 1 + +/* + * The goal for walking incore inodes. These can correspond with incore inode + * radix tree tags when convenient. Avoid existing XFS_IWALK namespace. + */ +enum xfs_icwalk_goal { + /* Goals that are not related to tags; these must be < 0. */ + XFS_ICWALK_DQRELE = -1, + + /* Goals directly associated with tagged inodes. */ + XFS_ICWALK_BLOCKGC = XFS_ICI_BLOCKGC_TAG, +}; + +#define XFS_ICWALK_NULL_TAG (-1U) + +/* Compute the inode radix tree tag for this goal. */ +static inline unsigned int +xfs_icwalk_tag(enum xfs_icwalk_goal goal) +{ + return goal < 0 ? XFS_ICWALK_NULL_TAG : goal; +} + static int xfs_icwalk(struct xfs_mount *mp, int iter_flags, int (*execute)(struct xfs_inode *ip, void *args), - void *args, int tag); + void *args, enum xfs_icwalk_goal goal); static int xfs_icwalk_ag(struct xfs_perag *pag, int iter_flags, int (*execute)(struct xfs_inode *ip, void *args), - void *args, int tag); + void *args, enum xfs_icwalk_goal goal);
/* * Private inode cache walk flags for struct xfs_eofblocks. Must not coincide @@ -787,7 +815,7 @@ xfs_dqrele_all_inodes( eofb.eof_flags |= XFS_ICWALK_FLAG_DROP_PDQUOT;
return xfs_icwalk(mp, XFS_INODE_WALK_INEW_WAIT, xfs_dqrele_inode, - &eofb, XFS_ICI_NO_TAG); + &eofb, XFS_ICWALK_DQRELE); } #endif /* CONFIG_XFS_QUOTA */
@@ -1538,7 +1566,7 @@ xfs_blockgc_worker( if (!sb_start_write_trylock(mp->m_super)) return; error = xfs_icwalk_ag(pag, 0, xfs_blockgc_scan_inode, NULL, - XFS_ICI_BLOCKGC_TAG); + XFS_ICWALK_BLOCKGC); if (error) xfs_info(mp, "AG %u preallocation gc worker failed, err=%d", pag->pag_agno, error); @@ -1557,7 +1585,7 @@ xfs_blockgc_free_space( trace_xfs_blockgc_free_space(mp, eofb, _RET_IP_);
return xfs_icwalk(mp, 0, xfs_blockgc_scan_inode, eofb, - XFS_ICI_BLOCKGC_TAG); + XFS_ICWALK_BLOCKGC); }
/* @@ -1638,7 +1666,7 @@ xfs_icwalk_ag( int iter_flags, int (*execute)(struct xfs_inode *ip, void *args), void *args, - int tag) + enum xfs_icwalk_goal goal) { struct xfs_mount *mp = pag->pag_mount; uint32_t first_index; @@ -1654,12 +1682,13 @@ xfs_icwalk_ag( nr_found = 0; do { struct xfs_inode *batch[XFS_LOOKUP_BATCH]; + unsigned int tag = xfs_icwalk_tag(goal); int error = 0; int i;
rcu_read_lock();
- if (tag == XFS_ICI_NO_TAG) + if (tag == XFS_ICWALK_NULL_TAG) nr_found = radix_tree_gang_lookup(&pag->pag_ici_root, (void **)batch, first_index, XFS_LOOKUP_BATCH); @@ -1742,9 +1771,11 @@ static inline struct xfs_perag * xfs_icwalk_get_perag( struct xfs_mount *mp, xfs_agnumber_t agno, - int tag) + enum xfs_icwalk_goal goal) { - if (tag == XFS_ICI_NO_TAG) + unsigned int tag = xfs_icwalk_tag(goal); + + if (tag == XFS_ICWALK_NULL_TAG) return xfs_perag_get(mp, agno); return xfs_perag_get_tag(mp, agno, tag); } @@ -1759,16 +1790,16 @@ xfs_icwalk( int iter_flags, int (*execute)(struct xfs_inode *ip, void *args), void *args, - int tag) + enum xfs_icwalk_goal goal) { struct xfs_perag *pag; int error = 0; int last_error = 0; xfs_agnumber_t agno = 0;
- while ((pag = xfs_icwalk_get_perag(mp, agno, tag))) { + while ((pag = xfs_icwalk_get_perag(mp, agno, goal))) { agno = pag->pag_agno + 1; - error = xfs_icwalk_ag(pag, iter_flags, execute, args, tag); + error = xfs_icwalk_ag(pag, iter_flags, execute, args, goal); xfs_perag_put(pag); if (error) { last_error = error; diff --git a/fs/xfs/xfs_icache.h b/fs/xfs/xfs_icache.h index d9baa6df1121..c4274c45d914 100644 --- a/fs/xfs/xfs_icache.h +++ b/fs/xfs/xfs_icache.h @@ -17,15 +17,6 @@ struct xfs_eofblocks { __u64 eof_min_file_size; };
-/* - * tags for inode radix tree - */ -#define XFS_ICI_NO_TAG (-1) /* special flag for an untagged lookup - in xfs_inode_walk */ -#define XFS_ICI_RECLAIM_TAG 0 /* inode is to be reclaimed */ -/* Inode has speculative preallocations (posteof or cow) to clean. */ -#define XFS_ICI_BLOCKGC_TAG 1 - /* * Flags for xfs_iget() */
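As an aside for readers following the refactor, here is a minimal userspace sketch of the goal-to-tag mapping idea (all names are invented for illustration; this is not the kernel code): goals that have no backing radix tree tag are negative and collapse to a sentinel value, so a single comparison decides between a tagged and an untagged lookup.

#include <stdio.h>

enum walk_goal {
	GOAL_UNTAGGED = -1,	/* no radix tree tag backs this goal */
	GOAL_TAGGED   = 1,	/* numerically equal to its tag */
};

#define NULL_TAG (-1U)

static unsigned int goal_to_tag(enum walk_goal goal)
{
	return goal < 0 ? NULL_TAG : (unsigned int)goal;
}

int main(void)
{
	/* Untagged goals map to the sentinel; tagged goals map to themselves. */
	printf("untagged -> %u\n", goal_to_tag(GOAL_UNTAGGED));
	printf("tagged   -> %u\n", goal_to_tag(GOAL_TAGGED));
	return 0;
}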
From: "Darrick J. Wong" djwong@kernel.org
mainline-inclusion from mainline-v5.13-rc4 commit b9baaef42f764db7089a19c82d2b783aef836437 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4KIAO CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
-------------------------------------------------
Disentangle the dqrele_all inode grab code from the "generic" inode walk grabbing code, and use the opportunity to document why the dqrele grab function does what it does. Since xfs_inode_walk_ag_grab is now only used for blockgc, rename it to reflect that.
Ultimately, there will be four reasons to perform a walk of incore inodes: quotaoff dquot releasing (dqrele), garbage collection of speculative preallocations (blockgc), reclamation of incore inodes (reclaim), and deferred inactivation (inodegc). Each of these four has its own slightly different criteria for deciding if it wants to handle an inode, so it makes more sense to have four cohesive igrab functions than one confusing parametric grab function like we do now.
Signed-off-by: Darrick J. Wong djwong@kernel.org Reviewed-by: Dave Chinner dchinner@redhat.com Signed-off-by: Lihong Kou koulihong@huawei.com Reviewed-by: Zhang Yi yi.zhang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- fs/xfs/xfs_icache.c | 71 +++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 66 insertions(+), 5 deletions(-)
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index 1e9491208982..c672a9a2c31b 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -770,6 +770,44 @@ xfs_icache_inode_is_allocated( #define XFS_LOOKUP_BATCH 32
#ifdef CONFIG_XFS_QUOTA +/* Decide if we want to grab this inode to drop its dquots. */ +static bool +xfs_dqrele_igrab( + struct xfs_inode *ip) +{ + bool ret = false; + + ASSERT(rcu_read_lock_held()); + + /* Check for stale RCU freed inode */ + spin_lock(&ip->i_flags_lock); + if (!ip->i_ino) + goto out_unlock; + + /* + * Skip inodes that are anywhere in the reclaim machinery because we + * drop dquots before tagging an inode for reclamation. + */ + if (ip->i_flags & (XFS_IRECLAIM | XFS_IRECLAIMABLE)) + goto out_unlock; + + /* + * The inode looks alive; try to grab a VFS reference so that it won't + * get destroyed. If we got the reference, return true to say that + * we grabbed the inode. + * + * If we can't get the reference, then we know the inode had its VFS + * state torn down and hasn't yet entered the reclaim machinery. Since + * we also know that dquots are detached from an inode before it enters + * reclaim, we can skip the inode. + */ + ret = igrab(VFS_I(ip)) != NULL; + +out_unlock: + spin_unlock(&ip->i_flags_lock); + return ret; +} + /* Drop this inode's dquots. */ static int xfs_dqrele_inode( @@ -817,6 +855,8 @@ xfs_dqrele_all_inodes( return xfs_icwalk(mp, XFS_INODE_WALK_INEW_WAIT, xfs_dqrele_inode, &eofb, XFS_ICWALK_DQRELE); } +#else +# define xfs_dqrele_igrab(ip) (false) #endif /* CONFIG_XFS_QUOTA */
/* @@ -1492,12 +1532,12 @@ xfs_blockgc_start( }
/* - * Decide if the given @ip is eligible to be a part of the inode walk, and - * grab it if so. Returns true if it's ready to go or false if we should just - * ignore it. + * Decide if the given @ip is eligible for garbage collection of speculative + * preallocations, and grab it if so. Returns true if it's ready to go or + * false if we should just ignore it. */ static bool -xfs_inode_walk_ag_grab( +xfs_blockgc_igrab( struct xfs_inode *ip, int flags) { @@ -1656,6 +1696,27 @@ xfs_blockgc_free_quota(
/* XFS Inode Cache Walking Code */
+/* + * Decide if we want to grab this inode in anticipation of doing work towards + * the goal. If selected, the VFS must hold a reference to this inode, which + * will be released after processing. + */ +static inline bool +xfs_icwalk_igrab( + enum xfs_icwalk_goal goal, + struct xfs_inode *ip, + int iter_flags) +{ + switch (goal) { + case XFS_ICWALK_DQRELE: + return xfs_dqrele_igrab(ip); + case XFS_ICWALK_BLOCKGC: + return xfs_blockgc_igrab(ip, iter_flags); + default: + return false; + } +} + /* * For a given per-AG structure @pag, grab, @execute, and rele all incore * inodes with the given radix tree @tag. @@ -1710,7 +1771,7 @@ xfs_icwalk_ag( for (i = 0; i < nr_found; i++) { struct xfs_inode *ip = batch[i];
- if (done || !xfs_inode_walk_ag_grab(ip, iter_flags)) + if (done || !xfs_icwalk_igrab(goal, ip, iter_flags)) batch[i] = NULL;
/*
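A hedged userspace analogue of the grab pattern documented above (a pthread mutex and a plain refcount stand in for i_flags_lock and igrab; all names are invented): eligibility is decided and the reference is taken under the same lock, so the object cannot slip into reclaim between the check and the grab.

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

#define F_RECLAIM 0x1	/* object is being torn down */

struct obj {
	pthread_mutex_t flags_lock;
	unsigned int flags;
	int refcount;
};

static bool try_grab(struct obj *o)
{
	bool ret = false;

	pthread_mutex_lock(&o->flags_lock);
	if (o->flags & F_RECLAIM)	/* too late; let reclaim have it */
		goto out_unlock;
	o->refcount++;			/* the reference pins the object */
	ret = true;
out_unlock:
	pthread_mutex_unlock(&o->flags_lock);
	return ret;
}

int main(void)
{
	struct obj o = { PTHREAD_MUTEX_INITIALIZER, 0, 0 };

	printf("grabbed=%d refs=%d\n", try_grab(&o), o.refcount);
	o.flags |= F_RECLAIM;
	printf("grabbed=%d refs=%d\n", try_grab(&o), o.refcount);
	return 0;
}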
From: "Darrick J. Wong" djwong@kernel.org
mainline-inclusion from mainline-v5.13-rc4 commit 9d2793ceecb9fd711f70a860685b71129cac5dc9 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4KIAO CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
-------------------------------------------------
Move the INEW wait into xfs_dqrele_inode so that we can drop the iter_flags parameter in the next patch.
Signed-off-by: Darrick J. Wong djwong@kernel.org Reviewed-by: Dave Chinner dchinner@redhat.com Signed-off-by: Lihong Kou koulihong@huawei.com Reviewed-by: Zhang Yi yi.zhang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- fs/xfs/xfs_icache.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-)
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index c672a9a2c31b..0822f11da48b 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -816,6 +816,9 @@ xfs_dqrele_inode( { struct xfs_eofblocks *eofb = priv;
+ if (xfs_iflags_test(ip, XFS_INEW)) + xfs_inew_wait(ip); + xfs_ilock(ip, XFS_ILOCK_EXCL); if (eofb->eof_flags & XFS_ICWALK_FLAG_DROP_UDQUOT) { xfs_qm_dqrele(ip->i_udquot); @@ -852,8 +855,7 @@ xfs_dqrele_all_inodes( if (qflags & XFS_PQUOTA_ACCT) eofb.eof_flags |= XFS_ICWALK_FLAG_DROP_PDQUOT;
- return xfs_icwalk(mp, XFS_INODE_WALK_INEW_WAIT, xfs_dqrele_inode, - &eofb, XFS_ICWALK_DQRELE); + return xfs_icwalk(mp, 0, xfs_dqrele_inode, &eofb, XFS_ICWALK_DQRELE); } #else # define xfs_dqrele_igrab(ip) (false)
From: "Darrick J. Wong" djwong@kernel.org
mainline-inclusion from mainline-v5.13-rc4 commit 7fdff52623b4df9c9ae665fe8bb727978c29414e category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4KIAO CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
-------------------------------------------------
The sole iter_flags, XFS_INODE_WALK_INEW_WAIT, no longer has any users. Remove the flag, the parameter, and all the code that used it.
Signed-off-by: Darrick J. Wong djwong@kernel.org Reviewed-by: Dave Chinner dchinner@redhat.com Signed-off-by: Lihong Kou koulihong@huawei.com Reviewed-by: Zhang Yi yi.zhang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- fs/xfs/xfs_icache.c | 33 ++++++++++++--------------------- fs/xfs/xfs_icache.h | 5 ----- 2 files changed, 12 insertions(+), 26 deletions(-)
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index 0822f11da48b..fbc65e41fac4 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -54,10 +54,10 @@ xfs_icwalk_tag(enum xfs_icwalk_goal goal) return goal < 0 ? XFS_ICWALK_NULL_TAG : goal; }
-static int xfs_icwalk(struct xfs_mount *mp, int iter_flags, +static int xfs_icwalk(struct xfs_mount *mp, int (*execute)(struct xfs_inode *ip, void *args), void *args, enum xfs_icwalk_goal goal); -static int xfs_icwalk_ag(struct xfs_perag *pag, int iter_flags, +static int xfs_icwalk_ag(struct xfs_perag *pag, int (*execute)(struct xfs_inode *ip, void *args), void *args, enum xfs_icwalk_goal goal);
@@ -281,7 +281,7 @@ xfs_inode_clear_reclaim_tag( xfs_perag_clear_reclaim_tag(pag); }
-static void +static inline void xfs_inew_wait( struct xfs_inode *ip) { @@ -855,7 +855,7 @@ xfs_dqrele_all_inodes( if (qflags & XFS_PQUOTA_ACCT) eofb.eof_flags |= XFS_ICWALK_FLAG_DROP_PDQUOT;
- return xfs_icwalk(mp, 0, xfs_dqrele_inode, &eofb, XFS_ICWALK_DQRELE); + return xfs_icwalk(mp, xfs_dqrele_inode, &eofb, XFS_ICWALK_DQRELE); } #else # define xfs_dqrele_igrab(ip) (false) @@ -1540,11 +1540,9 @@ xfs_blockgc_start( */ static bool xfs_blockgc_igrab( - struct xfs_inode *ip, - int flags) + struct xfs_inode *ip) { struct inode *inode = VFS_I(ip); - bool newinos = !!(flags & XFS_INODE_WALK_INEW_WAIT);
ASSERT(rcu_read_lock_held());
@@ -1554,8 +1552,7 @@ xfs_blockgc_igrab( goto out_unlock_noent;
/* avoid new or reclaimable inodes. Leave for reclaim code to flush */ - if ((!newinos && __xfs_iflags_test(ip, XFS_INEW)) || - __xfs_iflags_test(ip, XFS_IRECLAIMABLE | XFS_IRECLAIM)) + if (__xfs_iflags_test(ip, XFS_INEW | XFS_IRECLAIMABLE | XFS_IRECLAIM)) goto out_unlock_noent; spin_unlock(&ip->i_flags_lock);
@@ -1607,7 +1604,7 @@ xfs_blockgc_worker(
if (!sb_start_write_trylock(mp->m_super)) return; - error = xfs_icwalk_ag(pag, 0, xfs_blockgc_scan_inode, NULL, + error = xfs_icwalk_ag(pag, xfs_blockgc_scan_inode, NULL, XFS_ICWALK_BLOCKGC); if (error) xfs_info(mp, "AG %u preallocation gc worker failed, err=%d", @@ -1626,7 +1623,7 @@ xfs_blockgc_free_space( { trace_xfs_blockgc_free_space(mp, eofb, _RET_IP_);
- return xfs_icwalk(mp, 0, xfs_blockgc_scan_inode, eofb, + return xfs_icwalk(mp, xfs_blockgc_scan_inode, eofb, XFS_ICWALK_BLOCKGC); }
@@ -1706,14 +1703,13 @@ xfs_blockgc_free_quota( static inline bool xfs_icwalk_igrab( enum xfs_icwalk_goal goal, - struct xfs_inode *ip, - int iter_flags) + struct xfs_inode *ip) { switch (goal) { case XFS_ICWALK_DQRELE: return xfs_dqrele_igrab(ip); case XFS_ICWALK_BLOCKGC: - return xfs_blockgc_igrab(ip, iter_flags); + return xfs_blockgc_igrab(ip); default: return false; } @@ -1726,7 +1722,6 @@ xfs_icwalk_igrab( static int xfs_icwalk_ag( struct xfs_perag *pag, - int iter_flags, int (*execute)(struct xfs_inode *ip, void *args), void *args, enum xfs_icwalk_goal goal) @@ -1773,7 +1768,7 @@ xfs_icwalk_ag( for (i = 0; i < nr_found; i++) { struct xfs_inode *ip = batch[i];
- if (done || !xfs_icwalk_igrab(goal, ip, iter_flags)) + if (done || !xfs_icwalk_igrab(goal, ip)) batch[i] = NULL;
/* @@ -1801,9 +1796,6 @@ xfs_icwalk_ag( for (i = 0; i < nr_found; i++) { if (!batch[i]) continue; - if ((iter_flags & XFS_INODE_WALK_INEW_WAIT) && - xfs_iflags_test(batch[i], XFS_INEW)) - xfs_inew_wait(batch[i]); error = execute(batch[i], args); xfs_irele(batch[i]); if (error == -EAGAIN) { @@ -1850,7 +1842,6 @@ xfs_icwalk_get_perag( static int xfs_icwalk( struct xfs_mount *mp, - int iter_flags, int (*execute)(struct xfs_inode *ip, void *args), void *args, enum xfs_icwalk_goal goal) @@ -1862,7 +1853,7 @@ xfs_icwalk(
while ((pag = xfs_icwalk_get_perag(mp, agno, goal))) { agno = pag->pag_agno + 1; - error = xfs_icwalk_ag(pag, iter_flags, execute, args, goal); + error = xfs_icwalk_ag(pag, execute, args, goal); xfs_perag_put(pag); if (error) { last_error = error; diff --git a/fs/xfs/xfs_icache.h b/fs/xfs/xfs_icache.h index c4274c45d914..3ec00f1fea86 100644 --- a/fs/xfs/xfs_icache.h +++ b/fs/xfs/xfs_icache.h @@ -25,11 +25,6 @@ struct xfs_eofblocks { #define XFS_IGET_DONTCACHE 0x4 #define XFS_IGET_INCORE 0x8 /* don't read from disk or reinit */
-/* - * flags for AG inode iterator - */ -#define XFS_INODE_WALK_INEW_WAIT 0x1 /* wait on new inodes */ - int xfs_iget(struct xfs_mount *mp, struct xfs_trans *tp, xfs_ino_t ino, uint flags, uint lock_flags, xfs_inode_t **ipp);
From: "Darrick J. Wong" djwong@kernel.org
mainline-inclusion from mainline-v5.13-rc4 commit f427cf5c6236acdf72b4d8564b2e18937c4cc8d8 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4KIAO CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
-------------------------------------------------
It turns out that there is a 1:1 mapping between the execute and goal parameters that are passed to xfs_inode_walk_ag:
xfs_blockgc_scan_inode <=> XFS_ICWALK_BLOCKGC xfs_dqrele_inode <=> XFS_ICWALK_DQRELE
Because of this exact correspondence, we don't need the execute function pointer and can replace it with a direct call.
For the price of a forward static declaration, we can eliminate the indirect function call. This likely has a negligible impact on performance (since the execute function runs transactions), but it also simplifies the function signature.
Signed-off-by: Darrick J. Wong djwong@kernel.org Reviewed-by: Dave Chinner dchinner@redhat.com Signed-off-by: Lihong Kou koulihong@huawei.com Reviewed-by: Zhang Yi yi.zhang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- fs/xfs/xfs_icache.c | 60 +++++++++++++++++++++++++++------------------ 1 file changed, 36 insertions(+), 24 deletions(-)
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index fbc65e41fac4..e47c6fa89106 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -55,11 +55,9 @@ xfs_icwalk_tag(enum xfs_icwalk_goal goal) }
static int xfs_icwalk(struct xfs_mount *mp, - int (*execute)(struct xfs_inode *ip, void *args), - void *args, enum xfs_icwalk_goal goal); + enum xfs_icwalk_goal goal, void *args); static int xfs_icwalk_ag(struct xfs_perag *pag, - int (*execute)(struct xfs_inode *ip, void *args), - void *args, enum xfs_icwalk_goal goal); + enum xfs_icwalk_goal goal, void *args);
/* * Private inode cache walk flags for struct xfs_eofblocks. Must not coincide @@ -855,10 +853,11 @@ xfs_dqrele_all_inodes( if (qflags & XFS_PQUOTA_ACCT) eofb.eof_flags |= XFS_ICWALK_FLAG_DROP_PDQUOT;
- return xfs_icwalk(mp, xfs_dqrele_inode, &eofb, XFS_ICWALK_DQRELE); + return xfs_icwalk(mp, XFS_ICWALK_DQRELE, &eofb); } #else # define xfs_dqrele_igrab(ip) (false) +# define xfs_dqrele_inode(ip, priv) (0) #endif /* CONFIG_XFS_QUOTA */
/* @@ -1604,8 +1603,7 @@ xfs_blockgc_worker(
if (!sb_start_write_trylock(mp->m_super)) return; - error = xfs_icwalk_ag(pag, xfs_blockgc_scan_inode, NULL, - XFS_ICWALK_BLOCKGC); + error = xfs_icwalk_ag(pag, XFS_ICWALK_BLOCKGC, NULL); if (error) xfs_info(mp, "AG %u preallocation gc worker failed, err=%d", pag->pag_agno, error); @@ -1623,8 +1621,7 @@ xfs_blockgc_free_space( { trace_xfs_blockgc_free_space(mp, eofb, _RET_IP_);
- return xfs_icwalk(mp, xfs_blockgc_scan_inode, eofb, - XFS_ICWALK_BLOCKGC); + return xfs_icwalk(mp, XFS_ICWALK_BLOCKGC, eofb); }
/* @@ -1715,16 +1712,36 @@ xfs_icwalk_igrab( } }
+/* Process an inode and release it. Return -EAGAIN to skip an inode. */ +static inline int +xfs_icwalk_process_inode( + enum xfs_icwalk_goal goal, + struct xfs_inode *ip, + void *args) +{ + int error; + + switch (goal) { + case XFS_ICWALK_DQRELE: + error = xfs_dqrele_inode(ip, args); + break; + case XFS_ICWALK_BLOCKGC: + error = xfs_blockgc_scan_inode(ip, args); + break; + } + xfs_irele(ip); + return error; +} + /* - * For a given per-AG structure @pag, grab, @execute, and rele all incore - * inodes with the given radix tree @tag. + * For a given per-AG structure @pag and a goal, grab qualifying inodes and + * process them in some manner. */ static int xfs_icwalk_ag( struct xfs_perag *pag, - int (*execute)(struct xfs_inode *ip, void *args), - void *args, - enum xfs_icwalk_goal goal) + enum xfs_icwalk_goal goal, + void *args) { struct xfs_mount *mp = pag->pag_mount; uint32_t first_index; @@ -1796,8 +1813,7 @@ xfs_icwalk_ag( for (i = 0; i < nr_found; i++) { if (!batch[i]) continue; - error = execute(batch[i], args); - xfs_irele(batch[i]); + error = xfs_icwalk_process_inode(goal, batch[i], args); if (error == -EAGAIN) { skipped++; continue; @@ -1835,16 +1851,12 @@ xfs_icwalk_get_perag( return xfs_perag_get_tag(mp, agno, tag); }
-/* - * Call the @execute function on all incore inodes matching the radix tree - * @tag. - */ +/* Walk all incore inodes to achieve a given goal. */ static int xfs_icwalk( struct xfs_mount *mp, - int (*execute)(struct xfs_inode *ip, void *args), - void *args, - enum xfs_icwalk_goal goal) + enum xfs_icwalk_goal goal, + void *args) { struct xfs_perag *pag; int error = 0; @@ -1853,7 +1865,7 @@ xfs_icwalk(
while ((pag = xfs_icwalk_get_perag(mp, agno, goal))) { agno = pag->pag_agno + 1; - error = xfs_icwalk_ag(pag, execute, args, goal); + error = xfs_icwalk_ag(pag, goal, args); xfs_perag_put(pag); if (error) { last_error = error;
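A small, self-contained sketch of the dispatch technique used above, with invented names: the function-pointer callback is replaced by a switch on the goal, at the price of forward static declarations, letting the compiler resolve (and possibly inline) the calls.

#include <stdio.h>

enum goal { GOAL_A, GOAL_B };

/* Forward static declarations are the price of direct dispatch. */
static int do_a(int x);
static int do_b(int x);

/* One switch instead of an int (*execute)(int) callback parameter. */
static int process(enum goal g, int x)
{
	switch (g) {
	case GOAL_A:
		return do_a(x);
	case GOAL_B:
		return do_b(x);
	}
	return 0;
}

static int do_a(int x) { return x + 1; }
static int do_b(int x) { return x * 2; }

int main(void)
{
	printf("%d %d\n", process(GOAL_A, 1), process(GOAL_B, 2));
	return 0;
}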
From: "Darrick J. Wong" djwong@kernel.org
mainline-inclusion from mainline-v5.13-rc4 commit d20d5edcf941e70e03cdbda2f8df93e3969c31a2 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4KIAO CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
-------------------------------------------------
Clean up the definition of which inode states are not eligible for speculative preallocation garbage collecting by creating a private #define for the set of flags to ignore, since we don't want the definition to end up a cluttered mess.
Signed-off-by: Darrick J. Wong djwong@kernel.org Reviewed-by: Dave Chinner dchinner@redhat.com Signed-off-by: Lihong Kou koulihong@huawei.com Reviewed-by: Zhang Yi yi.zhang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- fs/xfs/xfs_icache.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-)
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index e47c6fa89106..33d5be8a7cb2 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -1532,6 +1532,10 @@ xfs_blockgc_start( xfs_blockgc_queue(pag); }
+/* Don't try to run block gc on an inode that's in any of these states. */ +#define XFS_BLOCKGC_NOGRAB_IFLAGS (XFS_INEW | \ + XFS_IRECLAIMABLE | \ + XFS_IRECLAIM) /* * Decide if the given @ip is eligible for garbage collection of speculative * preallocations, and grab it if so. Returns true if it's ready to go or @@ -1550,8 +1554,7 @@ xfs_blockgc_igrab( if (!ip->i_ino) goto out_unlock_noent;
- /* avoid new or reclaimable inodes. Leave for reclaim code to flush */ - if (__xfs_iflags_test(ip, XFS_INEW | XFS_IRECLAIMABLE | XFS_IRECLAIM)) + if (ip->i_flags & XFS_BLOCKGC_NOGRAB_IFLAGS) goto out_unlock_noent; spin_unlock(&ip->i_flags_lock);
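The cleanup above boils down to a common C idiom; a tiny sketch with invented flag names: several inline flag tests collapse into one named mask, so the predicate stays a single bitwise test as more states are added.

#include <stdbool.h>
#include <stdio.h>

#define F_NEW		(1U << 0)
#define F_RECLAIMABLE	(1U << 1)
#define F_RECLAIM	(1U << 2)

/* One named mask instead of a growing list of inline tests. */
#define NOGRAB_FLAGS	(F_NEW | F_RECLAIMABLE | F_RECLAIM)

static bool can_grab(unsigned int flags)
{
	return !(flags & NOGRAB_FLAGS);
}

int main(void)
{
	printf("clean=%d reclaiming=%d\n", can_grab(0), can_grab(F_RECLAIM));
	return 0;
}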
From: "Darrick J. Wong" djwong@kernel.org
mainline-inclusion from mainline-v5.13-rc4 commit 594ab00b760f1722b800c45d37adc21eecf42dc1 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4KIAO CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
-------------------------------------------------
Soon we're going to be adding two new callers to the incore inode walk code: reclamation of incore inodes, and (later) inactivation of inodes. Both of these walks operate on inodes that no longer have any VFS state, so we need to move the xfs_irele calls into the processing functions.
In other words, icwalk processing functions are responsible for cleaning up whatever state changes are made by the corresponding icwalk igrab function that picked the inode for processing.
Signed-off-by: Darrick J. Wong djwong@kernel.org Reviewed-by: Dave Chinner dchinner@redhat.com Signed-off-by: Lihong Kou koulihong@huawei.com Reviewed-by: Zhang Yi yi.zhang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- fs/xfs/xfs_icache.c | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-)
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index 33d5be8a7cb2..9041773e90a8 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -807,7 +807,7 @@ xfs_dqrele_igrab( }
/* Drop this inode's dquots. */ -static int +static void xfs_dqrele_inode( struct xfs_inode *ip, void *priv) @@ -831,7 +831,7 @@ xfs_dqrele_inode( ip->i_pdquot = NULL; } xfs_iunlock(ip, XFS_ILOCK_EXCL); - return 0; + xfs_irele(ip); }
/* @@ -857,7 +857,7 @@ xfs_dqrele_all_inodes( } #else # define xfs_dqrele_igrab(ip) (false) -# define xfs_dqrele_inode(ip, priv) (0) +# define xfs_dqrele_inode(ip, priv) ((void)0) #endif /* CONFIG_XFS_QUOTA */
/* @@ -1591,6 +1591,7 @@ xfs_blockgc_scan_inode( unlock: if (lockflags) xfs_iunlock(ip, lockflags); + xfs_irele(ip); return error; }
@@ -1697,8 +1698,7 @@ xfs_blockgc_free_quota(
/* * Decide if we want to grab this inode in anticipation of doing work towards - * the goal. If selected, the VFS must hold a reference to this inode, which - * will be released after processing. + * the goal. */ static inline bool xfs_icwalk_igrab( @@ -1715,24 +1715,26 @@ xfs_icwalk_igrab( } }
-/* Process an inode and release it. Return -EAGAIN to skip an inode. */ +/* + * Process an inode. Each processing function must handle any state changes + * made by the icwalk igrab function. Return -EAGAIN to skip an inode. + */ static inline int xfs_icwalk_process_inode( enum xfs_icwalk_goal goal, struct xfs_inode *ip, void *args) { - int error; + int error = 0;
switch (goal) { case XFS_ICWALK_DQRELE: - error = xfs_dqrele_inode(ip, args); + xfs_dqrele_inode(ip, args); break; case XFS_ICWALK_BLOCKGC: error = xfs_blockgc_scan_inode(ip, args); break; } - xfs_irele(ip); return error; }
From: "Darrick J. Wong" djwong@kernel.org
mainline-inclusion from mainline-v5.13-rc4 commit 919a4ddb68413056ecb7c71d9d5465bb54c8032b category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4KIAO CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
-------------------------------------------------
Radix tree tags are supposed to be unsigned ints, so fix the callers.
Signed-off-by: Darrick J. Wong djwong@kernel.org Reviewed-by: Dave Chinner dchinner@redhat.com Signed-off-by: Lihong Kou koulihong@huawei.com Reviewed-by: Zhang Yi yi.zhang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- fs/xfs/libxfs/xfs_sb.c | 2 +- fs/xfs/libxfs/xfs_sb.h | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-)
diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c index 66e8353da2f3..13ef340044b3 100644 --- a/fs/xfs/libxfs/xfs_sb.c +++ b/fs/xfs/libxfs/xfs_sb.c @@ -61,7 +61,7 @@ struct xfs_perag * xfs_perag_get_tag( struct xfs_mount *mp, xfs_agnumber_t first, - int tag) + unsigned int tag) { struct xfs_perag *pag; int found; diff --git a/fs/xfs/libxfs/xfs_sb.h b/fs/xfs/libxfs/xfs_sb.h index 92465a9a5162..bd56ab5d65f4 100644 --- a/fs/xfs/libxfs/xfs_sb.h +++ b/fs/xfs/libxfs/xfs_sb.h @@ -17,8 +17,8 @@ struct xfs_perag; * perag get/put wrappers for ref counting */ extern struct xfs_perag *xfs_perag_get(struct xfs_mount *, xfs_agnumber_t); -extern struct xfs_perag *xfs_perag_get_tag(struct xfs_mount *, xfs_agnumber_t, - int tag); +struct xfs_perag *xfs_perag_get_tag(struct xfs_mount *mp, xfs_agnumber_t agno, + unsigned int tag); extern void xfs_perag_put(struct xfs_perag *pag); extern int xfs_initialize_perag_data(struct xfs_mount *, xfs_agnumber_t);
From: "Darrick J. Wong" djwong@kernel.org
mainline-inclusion from mainline-v5.13-rc4 commit 9d5ee837595134f91bb2d66f571f498c3b8ab148 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4KIAO CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
-------------------------------------------------
Pass a pointer to the actual eofb structure around the inode scanner functions instead of a void pointer, now that none of the functions is used as a callback.
Signed-off-by: Darrick J. Wong djwong@kernel.org Reviewed-by: Christoph Hellwig hch@lst.de Reviewed-by: Dave Chinner dchinner@redhat.com Signed-off-by: Lihong Kou koulihong@huawei.com Reviewed-by: Zhang Yi yi.zhang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- fs/xfs/xfs_icache.c | 34 +++++++++++++++------------------- 1 file changed, 15 insertions(+), 19 deletions(-)
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index 9041773e90a8..c27b1ef18372 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -55,9 +55,9 @@ xfs_icwalk_tag(enum xfs_icwalk_goal goal) }
static int xfs_icwalk(struct xfs_mount *mp, - enum xfs_icwalk_goal goal, void *args); + enum xfs_icwalk_goal goal, struct xfs_eofblocks *eofb); static int xfs_icwalk_ag(struct xfs_perag *pag, - enum xfs_icwalk_goal goal, void *args); + enum xfs_icwalk_goal goal, struct xfs_eofblocks *eofb);
/* * Private inode cache walk flags for struct xfs_eofblocks. Must not coincide @@ -810,10 +810,8 @@ xfs_dqrele_igrab( static void xfs_dqrele_inode( struct xfs_inode *ip, - void *priv) + struct xfs_eofblocks *eofb) { - struct xfs_eofblocks *eofb = priv; - if (xfs_iflags_test(ip, XFS_INEW)) xfs_inew_wait(ip);
@@ -1230,10 +1228,9 @@ xfs_reclaim_worker( STATIC int xfs_inode_free_eofblocks( struct xfs_inode *ip, - void *args, + struct xfs_eofblocks *eofb, unsigned int *lockflags) { - struct xfs_eofblocks *eofb = args; bool wait;
wait = eofb && (eofb->eof_flags & XFS_EOF_FLAGS_SYNC); @@ -1438,10 +1435,9 @@ xfs_prep_free_cowblocks( STATIC int xfs_inode_free_cowblocks( struct xfs_inode *ip, - void *args, + struct xfs_eofblocks *eofb, unsigned int *lockflags) { - struct xfs_eofblocks *eofb = args; bool wait; int ret = 0;
@@ -1578,16 +1574,16 @@ xfs_blockgc_igrab( static int xfs_blockgc_scan_inode( struct xfs_inode *ip, - void *args) + struct xfs_eofblocks *eofb) { unsigned int lockflags = 0; int error;
- error = xfs_inode_free_eofblocks(ip, args, &lockflags); + error = xfs_inode_free_eofblocks(ip, eofb, &lockflags); if (error) goto unlock;
- error = xfs_inode_free_cowblocks(ip, args, &lockflags); + error = xfs_inode_free_cowblocks(ip, eofb, &lockflags); unlock: if (lockflags) xfs_iunlock(ip, lockflags); @@ -1723,16 +1719,16 @@ static inline int xfs_icwalk_process_inode( enum xfs_icwalk_goal goal, struct xfs_inode *ip, - void *args) + struct xfs_eofblocks *eofb) { int error = 0;
switch (goal) { case XFS_ICWALK_DQRELE: - xfs_dqrele_inode(ip, args); + xfs_dqrele_inode(ip, eofb); break; case XFS_ICWALK_BLOCKGC: - error = xfs_blockgc_scan_inode(ip, args); + error = xfs_blockgc_scan_inode(ip, eofb); break; } return error; @@ -1746,7 +1742,7 @@ static int xfs_icwalk_ag( struct xfs_perag *pag, enum xfs_icwalk_goal goal, - void *args) + struct xfs_eofblocks *eofb) { struct xfs_mount *mp = pag->pag_mount; uint32_t first_index; @@ -1818,7 +1814,7 @@ xfs_icwalk_ag( for (i = 0; i < nr_found; i++) { if (!batch[i]) continue; - error = xfs_icwalk_process_inode(goal, batch[i], args); + error = xfs_icwalk_process_inode(goal, batch[i], eofb); if (error == -EAGAIN) { skipped++; continue; @@ -1861,7 +1857,7 @@ static int xfs_icwalk( struct xfs_mount *mp, enum xfs_icwalk_goal goal, - void *args) + struct xfs_eofblocks *eofb) { struct xfs_perag *pag; int error = 0; @@ -1870,7 +1866,7 @@ xfs_icwalk(
while ((pag = xfs_icwalk_get_perag(mp, agno, goal))) { agno = pag->pag_agno + 1; - error = xfs_icwalk_ag(pag, goal, args); + error = xfs_icwalk_ag(pag, goal, eofb); xfs_perag_put(pag); if (error) { last_error = error;
From: "Darrick J. Wong" djwong@kernel.org
mainline-inclusion from mainline-v5.13-rc4 commit f1bc5c5630f90b83b339e8970dcf6d03abba5bd5 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4KIAO CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
-------------------------------------------------
Merge these two inode walk loops together, since they're pretty similar now.
Signed-off-by: Darrick J. Wong djwong@kernel.org Reviewed-by: Dave Chinner dchinner@redhat.com Signed-off-by: Lihong Kou koulihong@huawei.com Reviewed-by: Zhang Yi yi.zhang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- fs/xfs/xfs_icache.c | 162 +++++++++++++------------------------------- fs/xfs/xfs_icache.h | 1 + fs/xfs/xfs_trace.h | 5 +- 3 files changed, 53 insertions(+), 115 deletions(-)
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index c27b1ef18372..d15087b89b33 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -43,6 +43,7 @@ enum xfs_icwalk_goal {
/* Goals directly associated with tagged inodes. */ XFS_ICWALK_BLOCKGC = XFS_ICI_BLOCKGC_TAG, + XFS_ICWALK_RECLAIM = XFS_ICI_RECLAIM_TAG, };
#define XFS_ICWALK_NULL_TAG (-1U) @@ -67,9 +68,13 @@ static int xfs_icwalk_ag(struct xfs_perag *pag, #define XFS_ICWALK_FLAG_DROP_GDQUOT (1U << 30) #define XFS_ICWALK_FLAG_DROP_PDQUOT (1U << 29)
+/* Stop scanning after icw_scan_limit inodes. */ +#define XFS_ICWALK_FLAG_SCAN_LIMIT (1U << 28) + #define XFS_ICWALK_PRIVATE_FLAGS (XFS_ICWALK_FLAG_DROP_UDQUOT | \ XFS_ICWALK_FLAG_DROP_GDQUOT | \ - XFS_ICWALK_FLAG_DROP_PDQUOT) + XFS_ICWALK_FLAG_DROP_PDQUOT | \ + XFS_ICWALK_FLAG_SCAN_LIMIT)
/* * Allocate and initialise an xfs_inode. @@ -756,17 +761,6 @@ xfs_icache_inode_is_allocated( return 0; }
-/* - * The inode lookup is done in batches to keep the amount of lock traffic and - * radix tree lookups to a minimum. The batch size is a trade off between - * lookup reduction and stack usage. This is in the reclaim path, so we can't - * be too greedy. - * - * XXX: This will be moved closer to xfs_icwalk* once we get rid of the - * separate reclaim walk functions. - */ -#define XFS_LOOKUP_BATCH 32 - #ifdef CONFIG_XFS_QUOTA /* Decide if we want to grab this inode to drop its dquots. */ static bool @@ -876,7 +870,7 @@ xfs_dqrele_all_inodes( * Return true if we grabbed it, false otherwise. */ static bool -xfs_reclaim_inode_grab( +xfs_reclaim_igrab( struct xfs_inode *ip) { ASSERT(rcu_read_lock_held()); @@ -988,108 +982,13 @@ xfs_reclaim_inode( xfs_iflags_clear(ip, XFS_IRECLAIM); }
-/* - * Walk the AGs and reclaim the inodes in them. Even if the filesystem is - * corrupted, we still want to try to reclaim all the inodes. If we don't, - * then a shut down during filesystem unmount reclaim walk leak all the - * unreclaimed inodes. - * - * Returns non-zero if any AGs or inodes were skipped in the reclaim pass - * so that callers that want to block until all dirty inodes are written back - * and reclaimed can sanely loop. - */ -static void -xfs_reclaim_inodes_ag( - struct xfs_mount *mp, - int *nr_to_scan) -{ - struct xfs_perag *pag; - xfs_agnumber_t ag = 0; - - while ((pag = xfs_perag_get_tag(mp, ag, XFS_ICI_RECLAIM_TAG))) { - unsigned long first_index = 0; - int done = 0; - int nr_found = 0; - - ag = pag->pag_agno + 1; - - first_index = READ_ONCE(pag->pag_ici_reclaim_cursor); - do { - struct xfs_inode *batch[XFS_LOOKUP_BATCH]; - int i; - - rcu_read_lock(); - nr_found = radix_tree_gang_lookup_tag( - &pag->pag_ici_root, - (void **)batch, first_index, - XFS_LOOKUP_BATCH, - XFS_ICI_RECLAIM_TAG); - if (!nr_found) { - done = 1; - rcu_read_unlock(); - break; - } - - /* - * Grab the inodes before we drop the lock. if we found - * nothing, nr == 0 and the loop will be skipped. - */ - for (i = 0; i < nr_found; i++) { - struct xfs_inode *ip = batch[i]; - - if (done || !xfs_reclaim_inode_grab(ip)) - batch[i] = NULL; - - /* - * Update the index for the next lookup. Catch - * overflows into the next AG range which can - * occur if we have inodes in the last block of - * the AG and we are currently pointing to the - * last inode. - * - * Because we may see inodes that are from the - * wrong AG due to RCU freeing and - * reallocation, only update the index if it - * lies in this AG. It was a race that lead us - * to see this inode, so another lookup from - * the same index will not find it again. - */ - if (XFS_INO_TO_AGNO(mp, ip->i_ino) != - pag->pag_agno) - continue; - first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1); - if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino)) - done = 1; - } - - /* unlock now we've grabbed the inodes. */ - rcu_read_unlock(); - - for (i = 0; i < nr_found; i++) { - if (batch[i]) - xfs_reclaim_inode(batch[i], pag); - } - - *nr_to_scan -= XFS_LOOKUP_BATCH; - cond_resched(); - } while (nr_found && !done && *nr_to_scan > 0); - - if (done) - first_index = 0; - WRITE_ONCE(pag->pag_ici_reclaim_cursor, first_index); - xfs_perag_put(pag); - } -} - void xfs_reclaim_inodes( struct xfs_mount *mp) { - int nr_to_scan = INT_MAX; - while (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_RECLAIM_TAG)) { xfs_ail_push_all_sync(mp->m_ail); - xfs_reclaim_inodes_ag(mp, &nr_to_scan); + xfs_icwalk(mp, XFS_ICWALK_RECLAIM, NULL); } }
@@ -1105,11 +1004,16 @@ xfs_reclaim_inodes_nr( struct xfs_mount *mp, int nr_to_scan) { + struct xfs_eofblocks eofb = { + .eof_flags = XFS_ICWALK_FLAG_SCAN_LIMIT, + .icw_scan_limit = nr_to_scan, + }; + /* kick background reclaimer and push the AIL */ xfs_reclaim_work_queue(mp); xfs_ail_push_all(mp->m_ail);
- xfs_reclaim_inodes_ag(mp, &nr_to_scan); + xfs_icwalk(mp, XFS_ICWALK_RECLAIM, &eofb); return 0; }
@@ -1219,9 +1123,8 @@ xfs_reclaim_worker( { struct xfs_mount *mp = container_of(to_delayed_work(work), struct xfs_mount, m_reclaim_work); - int nr_to_scan = INT_MAX;
- xfs_reclaim_inodes_ag(mp, &nr_to_scan); + xfs_icwalk(mp, XFS_ICWALK_RECLAIM, NULL); xfs_reclaim_work_queue(mp); }
@@ -1692,6 +1595,15 @@ xfs_blockgc_free_quota(
/* XFS Inode Cache Walking Code */
+/* + * The inode lookup is done in batches to keep the amount of lock traffic and + * radix tree lookups to a minimum. The batch size is a trade off between + * lookup reduction and stack usage. This is in the reclaim path, so we can't + * be too greedy. + */ +#define XFS_LOOKUP_BATCH 32 + + /* * Decide if we want to grab this inode in anticipation of doing work towards * the goal. @@ -1706,6 +1618,8 @@ xfs_icwalk_igrab( return xfs_dqrele_igrab(ip); case XFS_ICWALK_BLOCKGC: return xfs_blockgc_igrab(ip); + case XFS_ICWALK_RECLAIM: + return xfs_reclaim_igrab(ip); default: return false; } @@ -1719,6 +1633,7 @@ static inline int xfs_icwalk_process_inode( enum xfs_icwalk_goal goal, struct xfs_inode *ip, + struct xfs_perag *pag, struct xfs_eofblocks *eofb) { int error = 0; @@ -1730,6 +1645,9 @@ xfs_icwalk_process_inode( case XFS_ICWALK_BLOCKGC: error = xfs_blockgc_scan_inode(ip, eofb); break; + case XFS_ICWALK_RECLAIM: + xfs_reclaim_inode(ip, pag); + break; } return error; } @@ -1754,7 +1672,10 @@ xfs_icwalk_ag( restart: done = false; skipped = 0; - first_index = 0; + if (goal == XFS_ICWALK_RECLAIM) + first_index = READ_ONCE(pag->pag_ici_reclaim_cursor); + else + first_index = 0; nr_found = 0; do { struct xfs_inode *batch[XFS_LOOKUP_BATCH]; @@ -1775,6 +1696,7 @@ xfs_icwalk_ag( XFS_LOOKUP_BATCH, tag);
if (!nr_found) { + done = true; rcu_read_unlock(); break; } @@ -1814,7 +1736,8 @@ xfs_icwalk_ag( for (i = 0; i < nr_found; i++) { if (!batch[i]) continue; - error = xfs_icwalk_process_inode(goal, batch[i], eofb); + error = xfs_icwalk_process_inode(goal, batch[i], pag, + eofb); if (error == -EAGAIN) { skipped++; continue; @@ -1829,8 +1752,19 @@ xfs_icwalk_ag(
cond_resched();
+ if (eofb && (eofb->eof_flags & XFS_ICWALK_FLAG_SCAN_LIMIT)) { + eofb->icw_scan_limit -= XFS_LOOKUP_BATCH; + if (eofb->icw_scan_limit <= 0) + break; + } } while (nr_found && !done);
+ if (goal == XFS_ICWALK_RECLAIM) { + if (done) + first_index = 0; + WRITE_ONCE(pag->pag_ici_reclaim_cursor, first_index); + } + if (skipped) { delay(1); goto restart; diff --git a/fs/xfs/xfs_icache.h b/fs/xfs/xfs_icache.h index 3ec00f1fea86..b6ab1067c52b 100644 --- a/fs/xfs/xfs_icache.h +++ b/fs/xfs/xfs_icache.h @@ -15,6 +15,7 @@ struct xfs_eofblocks { kgid_t eof_gid; prid_t eof_prid; __u64 eof_min_file_size; + int icw_scan_limit; };
/* diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index ab761b160e2c..0fb82dbc4352 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -3878,6 +3878,7 @@ DECLARE_EVENT_CLASS(xfs_eofblocks_class, __field(uint32_t, gid) __field(prid_t, prid) __field(__u64, min_file_size) + __field(int, scan_limit) __field(unsigned long, caller_ip) ), TP_fast_assign( @@ -3889,15 +3890,17 @@ DECLARE_EVENT_CLASS(xfs_eofblocks_class, eofb->eof_gid) : 0; __entry->prid = eofb ? eofb->eof_prid : 0; __entry->min_file_size = eofb ? eofb->eof_min_file_size : 0; + __entry->scan_limit = eofb ? eofb->icw_scan_limit : 0; __entry->caller_ip = caller_ip; ), - TP_printk("dev %d:%d flags 0x%x uid %u gid %u prid %u minsize %llu caller %pS", + TP_printk("dev %d:%d flags 0x%x uid %u gid %u prid %u minsize %llu scan_limit %d caller %pS", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->flags, __entry->uid, __entry->gid, __entry->prid, __entry->min_file_size, + __entry->scan_limit, (char *)__entry->caller_ip) ); #define DEFINE_EOFBLOCKS_EVENT(name) \
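A minimal sketch of the scan-limit mechanism introduced above (invented names, userspace only): the budget is decremented once per batch rather than per inode, so a limited walk may slightly overshoot the requested limit, just as the kernel code does with XFS_LOOKUP_BATCH.

#include <stdio.h>

#define LOOKUP_BATCH 32

static int walk(int total_items, int *scan_limit)
{
	int processed = 0;

	while (total_items > 0) {
		int batch = total_items < LOOKUP_BATCH ?
			    total_items : LOOKUP_BATCH;

		processed += batch;
		total_items -= batch;

		/* Charge a whole batch against the budget, even if the
		 * batch came up short; stop once the budget runs out. */
		if (scan_limit) {
			*scan_limit -= LOOKUP_BATCH;
			if (*scan_limit <= 0)
				break;
		}
	}
	return processed;
}

int main(void)
{
	int limit = 40;

	printf("unbounded: %d\n", walk(100, NULL));	/* 100 */
	printf("limited:   %d\n", walk(100, &limit));	/* 64: overshoot */
	return 0;
}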
From: "Darrick J. Wong" djwong@kernel.org
mainline-inclusion from mainline-v5.13-rc4 commit c076ae7a9361b87624900c722012a837fee0b1b3 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4KIAO CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
-------------------------------------------------
In preparation for adding another incore inode tree tag, refactor the code that sets and clears tags from the per-AG inode tree and the tree of per-AG structures, and remove the open-coded versions used by the blockgc code.
Note: for reclaim, we now rely more heavily on the radix tree tags than we used to, instead of the reclaimable inode count. The conversion should be fine, but the logic isn't 100% identical.
Signed-off-by: Darrick J. Wong djwong@kernel.org Reviewed-by: Dave Chinner dchinner@redhat.com Signed-off-by: Lihong Kou koulihong@huawei.com Reviewed-by: Zhang Yi yi.zhang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- fs/xfs/xfs_icache.c | 158 +++++++++++++++++++++----------------------- fs/xfs/xfs_icache.h | 2 +- fs/xfs/xfs_super.c | 2 +- fs/xfs/xfs_trace.h | 6 +- 4 files changed, 80 insertions(+), 88 deletions(-)
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index d15087b89b33..7916627c8613 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -206,46 +206,94 @@ xfs_reclaim_work_queue( rcu_read_unlock(); }
-static void -xfs_perag_set_reclaim_tag( +/* + * Background scanning to trim preallocated space. This is queued based on the + * 'speculative_prealloc_lifetime' tunable (5m by default). + */ +static inline void +xfs_blockgc_queue( struct xfs_perag *pag) +{ + rcu_read_lock(); + if (radix_tree_tagged(&pag->pag_ici_root, XFS_ICI_BLOCKGC_TAG)) + queue_delayed_work(pag->pag_mount->m_gc_workqueue, + &pag->pag_blockgc_work, + msecs_to_jiffies(xfs_blockgc_secs * 1000)); + rcu_read_unlock(); +} + +/* Set a tag on both the AG incore inode tree and the AG radix tree. */ +static void +xfs_perag_set_inode_tag( + struct xfs_perag *pag, + xfs_agino_t agino, + unsigned int tag) { struct xfs_mount *mp = pag->pag_mount; + bool was_tagged;
lockdep_assert_held(&pag->pag_ici_lock); - if (pag->pag_ici_reclaimable++) + + was_tagged = radix_tree_tagged(&pag->pag_ici_root, tag); + radix_tree_tag_set(&pag->pag_ici_root, agino, tag); + + if (tag == XFS_ICI_RECLAIM_TAG) + pag->pag_ici_reclaimable++; + + if (was_tagged) return;
- /* propagate the reclaim tag up into the perag radix tree */ + /* propagate the tag up into the perag radix tree */ spin_lock(&mp->m_perag_lock); - radix_tree_tag_set(&mp->m_perag_tree, pag->pag_agno, - XFS_ICI_RECLAIM_TAG); + radix_tree_tag_set(&mp->m_perag_tree, pag->pag_agno, tag); spin_unlock(&mp->m_perag_lock);
- /* schedule periodic background inode reclaim */ - xfs_reclaim_work_queue(mp); + /* start background work */ + switch (tag) { + case XFS_ICI_RECLAIM_TAG: + xfs_reclaim_work_queue(mp); + break; + case XFS_ICI_BLOCKGC_TAG: + xfs_blockgc_queue(pag); + break; + }
- trace_xfs_perag_set_reclaim(mp, pag->pag_agno, -1, _RET_IP_); + trace_xfs_perag_set_inode_tag(mp, pag->pag_agno, tag, _RET_IP_); }
+/* Clear a tag on both the AG incore inode tree and the AG radix tree. */ static void -xfs_perag_clear_reclaim_tag( - struct xfs_perag *pag) +xfs_perag_clear_inode_tag( + struct xfs_perag *pag, + xfs_agino_t agino, + unsigned int tag) { struct xfs_mount *mp = pag->pag_mount;
lockdep_assert_held(&pag->pag_ici_lock); - if (--pag->pag_ici_reclaimable) + + /* + * Reclaim can signal (with a null agino) that it cleared its own tag + * by removing the inode from the radix tree. + */ + if (agino != NULLAGINO) + radix_tree_tag_clear(&pag->pag_ici_root, agino, tag); + else + ASSERT(tag == XFS_ICI_RECLAIM_TAG); + + if (tag == XFS_ICI_RECLAIM_TAG) + pag->pag_ici_reclaimable--; + + if (radix_tree_tagged(&pag->pag_ici_root, tag)) return;
- /* clear the reclaim tag from the perag radix tree */ + /* clear the tag from the perag radix tree */ spin_lock(&mp->m_perag_lock); - radix_tree_tag_clear(&mp->m_perag_tree, pag->pag_agno, - XFS_ICI_RECLAIM_TAG); + radix_tree_tag_clear(&mp->m_perag_tree, pag->pag_agno, tag); spin_unlock(&mp->m_perag_lock); - trace_xfs_perag_clear_reclaim(mp, pag->pag_agno, -1, _RET_IP_); -}
+ trace_xfs_perag_clear_inode_tag(mp, pag->pag_agno, tag, _RET_IP_); +}
/* * We set the inode flag atomically with the radix tree tag. @@ -253,7 +301,7 @@ xfs_perag_clear_reclaim_tag( * can go away. */ void -xfs_inode_set_reclaim_tag( +xfs_inode_mark_reclaimable( struct xfs_inode *ip) { struct xfs_mount *mp = ip->i_mount; @@ -263,9 +311,8 @@ xfs_inode_set_reclaim_tag( spin_lock(&pag->pag_ici_lock); spin_lock(&ip->i_flags_lock);
- radix_tree_tag_set(&pag->pag_ici_root, XFS_INO_TO_AGINO(mp, ip->i_ino), - XFS_ICI_RECLAIM_TAG); - xfs_perag_set_reclaim_tag(pag); + xfs_perag_set_inode_tag(pag, XFS_INO_TO_AGINO(mp, ip->i_ino), + XFS_ICI_RECLAIM_TAG); __xfs_iflags_set(ip, XFS_IRECLAIMABLE);
spin_unlock(&ip->i_flags_lock); @@ -273,17 +320,6 @@ xfs_inode_set_reclaim_tag( xfs_perag_put(pag); }
-STATIC void -xfs_inode_clear_reclaim_tag( - struct xfs_perag *pag, - xfs_ino_t ino) -{ - radix_tree_tag_clear(&pag->pag_ici_root, - XFS_INO_TO_AGINO(pag->pag_mount, ino), - XFS_ICI_RECLAIM_TAG); - xfs_perag_clear_reclaim_tag(pag); -} - static inline void xfs_inew_wait( struct xfs_inode *ip) @@ -482,7 +518,9 @@ xfs_iget_cache_hit( */ ip->i_flags &= ~XFS_IRECLAIM_RESET_FLAGS; ip->i_flags |= XFS_INEW; - xfs_inode_clear_reclaim_tag(pag, ip->i_ino); + xfs_perag_clear_inode_tag(pag, + XFS_INO_TO_AGINO(pag->pag_mount, ino), + XFS_ICI_RECLAIM_TAG); inode->i_state = I_NEW; spin_unlock(&ip->i_flags_lock); spin_unlock(&pag->pag_ici_lock); @@ -955,7 +993,7 @@ xfs_reclaim_inode( if (!radix_tree_delete(&pag->pag_ici_root, XFS_INO_TO_AGINO(ip->i_mount, ino))) ASSERT(0); - xfs_perag_clear_reclaim_tag(pag); + xfs_perag_clear_inode_tag(pag, NULLAGINO, XFS_ICI_RECLAIM_TAG); spin_unlock(&pag->pag_ici_lock);
/* @@ -1172,22 +1210,6 @@ xfs_inode_free_eofblocks( return xfs_free_eofblocks(ip); }
-/* - * Background scanning to trim preallocated space. This is queued based on the - * 'speculative_prealloc_lifetime' tunable (5m by default). - */ -static inline void -xfs_blockgc_queue( - struct xfs_perag *pag) -{ - rcu_read_lock(); - if (radix_tree_tagged(&pag->pag_ici_root, XFS_ICI_BLOCKGC_TAG)) - queue_delayed_work(pag->pag_mount->m_gc_workqueue, - &pag->pag_blockgc_work, - msecs_to_jiffies(xfs_blockgc_secs * 1000)); - rcu_read_unlock(); -} - static void xfs_blockgc_set_iflag( struct xfs_inode *ip, @@ -1195,7 +1217,6 @@ xfs_blockgc_set_iflag( { struct xfs_mount *mp = ip->i_mount; struct xfs_perag *pag; - int tagged;
ASSERT((iflag & ~(XFS_IEOFBLOCKS | XFS_ICOWBLOCKS)) == 0);
@@ -1212,24 +1233,8 @@ xfs_blockgc_set_iflag( pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino)); spin_lock(&pag->pag_ici_lock);
- tagged = radix_tree_tagged(&pag->pag_ici_root, XFS_ICI_BLOCKGC_TAG); - radix_tree_tag_set(&pag->pag_ici_root, - XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino), - XFS_ICI_BLOCKGC_TAG); - if (!tagged) { - /* propagate the blockgc tag up into the perag radix tree */ - spin_lock(&ip->i_mount->m_perag_lock); - radix_tree_tag_set(&ip->i_mount->m_perag_tree, - XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino), - XFS_ICI_BLOCKGC_TAG); - spin_unlock(&ip->i_mount->m_perag_lock); - - /* kick off background trimming */ - xfs_blockgc_queue(pag); - - trace_xfs_perag_set_blockgc(ip->i_mount, pag->pag_agno, -1, - _RET_IP_); - } + xfs_perag_set_inode_tag(pag, XFS_INO_TO_AGINO(mp, ip->i_ino), + XFS_ICI_BLOCKGC_TAG);
spin_unlock(&pag->pag_ici_lock); xfs_perag_put(pag); @@ -1265,19 +1270,8 @@ xfs_blockgc_clear_iflag( pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino)); spin_lock(&pag->pag_ici_lock);
- radix_tree_tag_clear(&pag->pag_ici_root, - XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino), - XFS_ICI_BLOCKGC_TAG); - if (!radix_tree_tagged(&pag->pag_ici_root, XFS_ICI_BLOCKGC_TAG)) { - /* clear the blockgc tag from the perag radix tree */ - spin_lock(&ip->i_mount->m_perag_lock); - radix_tree_tag_clear(&ip->i_mount->m_perag_tree, - XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino), - XFS_ICI_BLOCKGC_TAG); - spin_unlock(&ip->i_mount->m_perag_lock); - trace_xfs_perag_clear_blockgc(ip->i_mount, pag->pag_agno, -1, - _RET_IP_); - } + xfs_perag_clear_inode_tag(pag, XFS_INO_TO_AGINO(mp, ip->i_ino), + XFS_ICI_BLOCKGC_TAG);
spin_unlock(&pag->pag_ici_lock); xfs_perag_put(pag); diff --git a/fs/xfs/xfs_icache.h b/fs/xfs/xfs_icache.h index b6ab1067c52b..191620a069af 100644 --- a/fs/xfs/xfs_icache.h +++ b/fs/xfs/xfs_icache.h @@ -39,7 +39,7 @@ void xfs_reclaim_inodes(struct xfs_mount *mp); int xfs_reclaim_inodes_count(struct xfs_mount *mp); long xfs_reclaim_inodes_nr(struct xfs_mount *mp, int nr_to_scan);
-void xfs_inode_set_reclaim_tag(struct xfs_inode *ip); +void xfs_inode_mark_reclaimable(struct xfs_inode *ip);
int xfs_blockgc_free_dquots(struct xfs_mount *mp, struct xfs_dquot *udqp, struct xfs_dquot *gdqp, struct xfs_dquot *pdqp, diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 46e148f5534d..bc2c1926f67f 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -687,7 +687,7 @@ xfs_fs_destroy_inode( * reclaim path handles this more efficiently than we can here, so * simply let background reclaim tear down all inodes. */ - xfs_inode_set_reclaim_tag(ip); + xfs_inode_mark_reclaimable(ip); }
static void diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 0fb82dbc4352..9a328461ce78 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -135,10 +135,8 @@ DEFINE_EVENT(xfs_perag_class, name, \ DEFINE_PERAG_REF_EVENT(xfs_perag_get); DEFINE_PERAG_REF_EVENT(xfs_perag_get_tag); DEFINE_PERAG_REF_EVENT(xfs_perag_put); -DEFINE_PERAG_REF_EVENT(xfs_perag_set_reclaim); -DEFINE_PERAG_REF_EVENT(xfs_perag_clear_reclaim); -DEFINE_PERAG_REF_EVENT(xfs_perag_set_blockgc); -DEFINE_PERAG_REF_EVENT(xfs_perag_clear_blockgc); +DEFINE_PERAG_REF_EVENT(xfs_perag_set_inode_tag); +DEFINE_PERAG_REF_EVENT(xfs_perag_clear_inode_tag);
DECLARE_EVENT_CLASS(xfs_ag_class, TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno),
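A compact sketch of the two-level tag propagation that xfs_perag_set_inode_tag and xfs_perag_clear_inode_tag generalize (invented names; a counter stands in for the radix tree tag bookkeeping): the parent tree is tagged only on the 0 -> 1 transition and untagged only on the 1 -> 0 transition, so background work is kicked exactly once per group.

#include <stdbool.h>
#include <stdio.h>

struct group {
	int tagged_items;	/* stand-in for radix_tree_tagged() */
	bool parent_tagged;
};

static void set_tag(struct group *g)
{
	bool was_tagged = g->tagged_items > 0;

	g->tagged_items++;
	if (was_tagged)
		return;			/* parent already knows about us */
	g->parent_tagged = true;	/* first tag: propagate upward */
}

static void clear_tag(struct group *g)
{
	if (--g->tagged_items > 0)
		return;			/* other tagged items remain */
	g->parent_tagged = false;	/* last tag gone: clear parent */
}

int main(void)
{
	struct group g = { 0, false };

	set_tag(&g);
	set_tag(&g);
	printf("items=%d parent=%d\n", g.tagged_items, g.parent_tagged);
	clear_tag(&g);
	printf("items=%d parent=%d\n", g.tagged_items, g.parent_tagged);
	clear_tag(&g);
	printf("items=%d parent=%d\n", g.tagged_items, g.parent_tagged);
	return 0;
}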
From: "Darrick J. Wong" djwong@kernel.org
mainline-inclusion from mainline-v5.13-rc4 commit 7975e465af6b46e9d0eaf94f764922dc92b28d9c category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4KIAO CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
-------------------------------------------------
When we decide to mark an inode sick, clear the DONTCACHE flag so that the incore inode will be kept around until memory pressure forces it out of memory. This increases the chances that the sick status will be caught by someone compiling a health report later on.
Signed-off-by: Darrick J. Wong djwong@kernel.org Reviewed-by: Dave Chinner dchinner@redhat.com Reviewed-by: Carlos Maiolino cmaiolino@redhat.com Signed-off-by: Lihong Kou koulihong@huawei.com Reviewed-by: Zhang Yi yi.zhang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- fs/xfs/xfs_health.c | 9 +++++++++ 1 file changed, 9 insertions(+)
diff --git a/fs/xfs/xfs_health.c b/fs/xfs/xfs_health.c index 8e0cb05a7142..806be8a93ea3 100644 --- a/fs/xfs/xfs_health.c +++ b/fs/xfs/xfs_health.c @@ -231,6 +231,15 @@ xfs_inode_mark_sick( ip->i_sick |= mask; ip->i_checked |= mask; spin_unlock(&ip->i_flags_lock); + + /* + * Keep this inode around so we don't lose the sickness report. Scrub + * grabs inodes with DONTCACHE assuming that most inode are ok, which + * is not the case here. + */ + spin_lock(&VFS_I(ip)->i_lock); + VFS_I(ip)->i_state &= ~I_DONTCACHE; + spin_unlock(&VFS_I(ip)->i_lock); }
/* Mark parts of an inode healed. */
From: "Darrick J. Wong" djwong@kernel.org
mainline-inclusion from mainline-v5.13-rc4 commit 9492750a8b18f02a8dec2aab594c59aabe2e4d0d category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4KIAO CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
-------------------------------------------------
It's important that the filesystem retain its memory of sick inodes for a little while after problems are found so that reports can be collected about what was wrong. Don't let inode reclamation free sick inodes unless we're unmounting or the fs already went down.
Signed-off-by: Darrick J. Wong djwong@kernel.org Reviewed-by: Dave Chinner dchinner@redhat.com Reviewed-by: Carlos Maiolino cmaiolino@redhat.com Signed-off-by: Lihong Kou koulihong@huawei.com Reviewed-by: Zhang Yi yi.zhang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- fs/xfs/xfs_icache.c | 45 +++++++++++++++++++++++++++++++++++++++------ 1 file changed, 39 insertions(+), 6 deletions(-)
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index 7916627c8613..02e837e6234e 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -71,10 +71,13 @@ static int xfs_icwalk_ag(struct xfs_perag *pag, /* Stop scanning after icw_scan_limit inodes. */ #define XFS_ICWALK_FLAG_SCAN_LIMIT (1U << 28)
+#define XFS_ICWALK_FLAG_RECLAIM_SICK (1U << 27) + #define XFS_ICWALK_PRIVATE_FLAGS (XFS_ICWALK_FLAG_DROP_UDQUOT | \ XFS_ICWALK_FLAG_DROP_GDQUOT | \ XFS_ICWALK_FLAG_DROP_PDQUOT | \ - XFS_ICWALK_FLAG_SCAN_LIMIT) + XFS_ICWALK_FLAG_SCAN_LIMIT | \ + XFS_ICWALK_FLAG_RECLAIM_SICK)
/* * Allocate and initialise an xfs_inode. @@ -909,7 +912,8 @@ xfs_dqrele_all_inodes( */ static bool xfs_reclaim_igrab( - struct xfs_inode *ip) + struct xfs_inode *ip, + struct xfs_eofblocks *eofb) { ASSERT(rcu_read_lock_held());
@@ -920,6 +924,14 @@ xfs_reclaim_igrab( spin_unlock(&ip->i_flags_lock); return false; } + + /* Don't reclaim a sick inode unless the caller asked for it. */ + if (ip->i_sick && + (!eofb || !(eofb->eof_flags & XFS_ICWALK_FLAG_RECLAIM_SICK))) { + spin_unlock(&ip->i_flags_lock); + return false; + } + __xfs_iflags_set(ip, XFS_IRECLAIM); spin_unlock(&ip->i_flags_lock); return true; @@ -1020,13 +1032,30 @@ xfs_reclaim_inode( xfs_iflags_clear(ip, XFS_IRECLAIM); }
+/* Reclaim sick inodes if we're unmounting or the fs went down. */ +static inline bool +xfs_want_reclaim_sick( + struct xfs_mount *mp) +{ + return (mp->m_flags & XFS_MOUNT_UNMOUNTING) || + (mp->m_flags & XFS_MOUNT_NORECOVERY) || + XFS_FORCED_SHUTDOWN(mp); +} + void xfs_reclaim_inodes( struct xfs_mount *mp) { + struct xfs_eofblocks eofb = { + .eof_flags = 0, + }; + + if (xfs_want_reclaim_sick(mp)) + eofb.eof_flags |= XFS_ICWALK_FLAG_RECLAIM_SICK; + while (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_RECLAIM_TAG)) { xfs_ail_push_all_sync(mp->m_ail); - xfs_icwalk(mp, XFS_ICWALK_RECLAIM, NULL); + xfs_icwalk(mp, XFS_ICWALK_RECLAIM, &eofb); } }
@@ -1047,6 +1076,9 @@ xfs_reclaim_inodes_nr( .icw_scan_limit = nr_to_scan, };
+ if (xfs_want_reclaim_sick(mp)) + eofb.eof_flags |= XFS_ICWALK_FLAG_RECLAIM_SICK; + /* kick background reclaimer and push the AIL */ xfs_reclaim_work_queue(mp); xfs_ail_push_all(mp->m_ail); @@ -1605,7 +1637,8 @@ xfs_blockgc_free_quota( static inline bool xfs_icwalk_igrab( enum xfs_icwalk_goal goal, - struct xfs_inode *ip) + struct xfs_inode *ip, + struct xfs_eofblocks *eofb) { switch (goal) { case XFS_ICWALK_DQRELE: @@ -1613,7 +1646,7 @@ xfs_icwalk_igrab( case XFS_ICWALK_BLOCKGC: return xfs_blockgc_igrab(ip); case XFS_ICWALK_RECLAIM: - return xfs_reclaim_igrab(ip); + return xfs_reclaim_igrab(ip, eofb); default: return false; } @@ -1702,7 +1735,7 @@ xfs_icwalk_ag( for (i = 0; i < nr_found; i++) { struct xfs_inode *ip = batch[i];
- if (done || !xfs_icwalk_igrab(goal, ip)) + if (done || !xfs_icwalk_igrab(goal, ip, eofb)) batch[i] = NULL;
/*
From: "Darrick J. Wong" djwong@kernel.org
mainline-inclusion from mainline-v5.13-rc4 commit 2d53f66baffde66fe72c360e3b9b0c8a2d7ce7c6 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4KIAO CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
-------------------------------------------------
In preparation for renaming struct xfs_eofblocks to struct xfs_icwalk, change the prefix of the existing XFS_EOF_FLAGS_* flags to XFS_ICWALK_FLAG_ and convert all the existing users. This adds a degree of interface separation between the ioctl definitions and the incore parameters. Since FLAGS_UNION is only used in xfs_icache.c, move it there as a private flag.
Signed-off-by: Darrick J. Wong djwong@kernel.org Reviewed-by: Dave Chinner dchinner@redhat.com Reviewed-by: Carlos Maiolino cmaiolino@redhat.com Signed-off-by: Lihong Kou koulihong@huawei.com Reviewed-by: Zhang Yi yi.zhang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- fs/xfs/xfs_file.c | 4 ++-- fs/xfs/xfs_icache.c | 44 +++++++++++++++++++++++--------------------- fs/xfs/xfs_icache.h | 17 +++++++++++++++-- fs/xfs/xfs_ioctl.c | 13 ++++++++++++- 4 files changed, 52 insertions(+), 26 deletions(-)
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 139df7fde2e6..7264b6d11f42 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -701,7 +701,7 @@ xfs_file_buffered_aio_write( */ if (ret == -EDQUOT && !cleared_space) { xfs_iunlock(ip, iolock); - xfs_blockgc_free_quota(ip, XFS_EOF_FLAGS_SYNC); + xfs_blockgc_free_quota(ip, XFS_ICWALK_FLAG_SYNC); cleared_space = true; goto write_retry; } else if (ret == -ENOSPC && !cleared_space) { @@ -711,7 +711,7 @@ xfs_file_buffered_aio_write( xfs_flush_inodes(ip->i_mount);
xfs_iunlock(ip, iolock); - eofb.eof_flags = XFS_EOF_FLAGS_SYNC; + eofb.eof_flags = XFS_ICWALK_FLAG_SYNC; xfs_blockgc_free_space(ip->i_mount, &eofb); goto write_retry; } diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index 02e837e6234e..986d191b04c5 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -62,7 +62,7 @@ static int xfs_icwalk_ag(struct xfs_perag *pag,
/* * Private inode cache walk flags for struct xfs_eofblocks. Must not coincide - * with XFS_EOF_FLAGS_*. + * with XFS_ICWALK_FLAGS_VALID. */ #define XFS_ICWALK_FLAG_DROP_UDQUOT (1U << 31) #define XFS_ICWALK_FLAG_DROP_GDQUOT (1U << 30) @@ -72,12 +72,14 @@ static int xfs_icwalk_ag(struct xfs_perag *pag, #define XFS_ICWALK_FLAG_SCAN_LIMIT (1U << 28)
#define XFS_ICWALK_FLAG_RECLAIM_SICK (1U << 27) +#define XFS_ICWALK_FLAG_UNION (1U << 26) /* union filter algorithm */
#define XFS_ICWALK_PRIVATE_FLAGS (XFS_ICWALK_FLAG_DROP_UDQUOT | \ XFS_ICWALK_FLAG_DROP_GDQUOT | \ XFS_ICWALK_FLAG_DROP_PDQUOT | \ XFS_ICWALK_FLAG_SCAN_LIMIT | \ - XFS_ICWALK_FLAG_RECLAIM_SICK) + XFS_ICWALK_FLAG_RECLAIM_SICK | \ + XFS_ICWALK_FLAG_UNION)
/* * Allocate and initialise an xfs_inode. @@ -1112,15 +1114,15 @@ xfs_inode_match_id( struct xfs_inode *ip, struct xfs_eofblocks *eofb) { - if ((eofb->eof_flags & XFS_EOF_FLAGS_UID) && + if ((eofb->eof_flags & XFS_ICWALK_FLAG_UID) && !uid_eq(VFS_I(ip)->i_uid, eofb->eof_uid)) return false;
- if ((eofb->eof_flags & XFS_EOF_FLAGS_GID) && + if ((eofb->eof_flags & XFS_ICWALK_FLAG_GID) && !gid_eq(VFS_I(ip)->i_gid, eofb->eof_gid)) return false;
- if ((eofb->eof_flags & XFS_EOF_FLAGS_PRID) && + if ((eofb->eof_flags & XFS_ICWALK_FLAG_PRID) && ip->i_d.di_projid != eofb->eof_prid) return false;
@@ -1136,15 +1138,15 @@ xfs_inode_match_id_union( struct xfs_inode *ip, struct xfs_eofblocks *eofb) { - if ((eofb->eof_flags & XFS_EOF_FLAGS_UID) && + if ((eofb->eof_flags & XFS_ICWALK_FLAG_UID) && uid_eq(VFS_I(ip)->i_uid, eofb->eof_uid)) return true;
- if ((eofb->eof_flags & XFS_EOF_FLAGS_GID) && + if ((eofb->eof_flags & XFS_ICWALK_FLAG_GID) && gid_eq(VFS_I(ip)->i_gid, eofb->eof_gid)) return true;
- if ((eofb->eof_flags & XFS_EOF_FLAGS_PRID) && + if ((eofb->eof_flags & XFS_ICWALK_FLAG_PRID) && ip->i_d.di_projid == eofb->eof_prid) return true;
@@ -1166,7 +1168,7 @@ xfs_inode_matches_eofb( if (!eofb) return true;
- if (eofb->eof_flags & XFS_EOF_FLAGS_UNION) + if (eofb->eof_flags & XFS_ICWALK_FLAG_UNION) match = xfs_inode_match_id_union(ip, eofb); else match = xfs_inode_match_id(ip, eofb); @@ -1174,7 +1176,7 @@ xfs_inode_matches_eofb( return false;
/* skip the inode if the file size is too small */ - if ((eofb->eof_flags & XFS_EOF_FLAGS_MINFILESIZE) && + if ((eofb->eof_flags & XFS_ICWALK_FLAG_MINFILESIZE) && XFS_ISIZE(ip) < eofb->eof_min_file_size) return false;
@@ -1206,7 +1208,7 @@ xfs_inode_free_eofblocks( { bool wait;
- wait = eofb && (eofb->eof_flags & XFS_EOF_FLAGS_SYNC); + wait = eofb && (eofb->eof_flags & XFS_ICWALK_FLAG_SYNC);
if (!xfs_iflags_test(ip, XFS_IEOFBLOCKS)) return 0; @@ -1370,7 +1372,7 @@ xfs_inode_free_cowblocks( bool wait; int ret = 0;
- wait = eofb && (eofb->eof_flags & XFS_EOF_FLAGS_SYNC); + wait = eofb && (eofb->eof_flags & XFS_ICWALK_FLAG_SYNC);
if (!xfs_iflags_test(ip, XFS_ICOWBLOCKS)) return 0; @@ -1560,7 +1562,7 @@ xfs_blockgc_free_space( * scan. * * Callers must not hold any inode's ILOCK. If requesting a synchronous scan - * (XFS_EOF_FLAGS_SYNC), the caller also must not hold any inode's IOLOCK or + * (XFS_ICWALK_FLAG_SYNC), the caller also must not hold any inode's IOLOCK or * MMAPLOCK. */ int @@ -1569,7 +1571,7 @@ xfs_blockgc_free_dquots( struct xfs_dquot *udqp, struct xfs_dquot *gdqp, struct xfs_dquot *pdqp, - unsigned int eof_flags) + unsigned int iwalk_flags) { struct xfs_eofblocks eofb = {0}; bool do_work = false; @@ -1581,23 +1583,23 @@ xfs_blockgc_free_dquots( * Run a scan to free blocks using the union filter to cover all * applicable quotas in a single scan. */ - eofb.eof_flags = XFS_EOF_FLAGS_UNION | eof_flags; + eofb.eof_flags = XFS_ICWALK_FLAG_UNION | iwalk_flags;
if (XFS_IS_UQUOTA_ENFORCED(mp) && udqp && xfs_dquot_lowsp(udqp)) { eofb.eof_uid = make_kuid(mp->m_super->s_user_ns, udqp->q_id); - eofb.eof_flags |= XFS_EOF_FLAGS_UID; + eofb.eof_flags |= XFS_ICWALK_FLAG_UID; do_work = true; }
if (XFS_IS_UQUOTA_ENFORCED(mp) && gdqp && xfs_dquot_lowsp(gdqp)) { eofb.eof_gid = make_kgid(mp->m_super->s_user_ns, gdqp->q_id); - eofb.eof_flags |= XFS_EOF_FLAGS_GID; + eofb.eof_flags |= XFS_ICWALK_FLAG_GID; do_work = true; }
if (XFS_IS_PQUOTA_ENFORCED(mp) && pdqp && xfs_dquot_lowsp(pdqp)) { eofb.eof_prid = pdqp->q_id; - eofb.eof_flags |= XFS_EOF_FLAGS_PRID; + eofb.eof_flags |= XFS_ICWALK_FLAG_PRID; do_work = true; }
@@ -1611,12 +1613,12 @@ xfs_blockgc_free_dquots( int xfs_blockgc_free_quota( struct xfs_inode *ip, - unsigned int eof_flags) + unsigned int iwalk_flags) { return xfs_blockgc_free_dquots(ip->i_mount, xfs_inode_dquot(ip, XFS_DQTYPE_USER), xfs_inode_dquot(ip, XFS_DQTYPE_GROUP), - xfs_inode_dquot(ip, XFS_DQTYPE_PROJ), eof_flags); + xfs_inode_dquot(ip, XFS_DQTYPE_PROJ), iwalk_flags); }
/* XFS Inode Cache Walking Code */ @@ -1836,5 +1838,5 @@ xfs_icwalk( } } return last_error; - BUILD_BUG_ON(XFS_ICWALK_PRIVATE_FLAGS & XFS_EOF_FLAGS_VALID); + BUILD_BUG_ON(XFS_ICWALK_PRIVATE_FLAGS & XFS_ICWALK_FLAGS_VALID); } diff --git a/fs/xfs/xfs_icache.h b/fs/xfs/xfs_icache.h index 191620a069af..b29048c493b6 100644 --- a/fs/xfs/xfs_icache.h +++ b/fs/xfs/xfs_icache.h @@ -18,6 +18,19 @@ struct xfs_eofblocks { int icw_scan_limit; };
+/* Flags that reflect xfs_fs_eofblocks functionality. */ +#define XFS_ICWALK_FLAG_SYNC (1U << 0) /* sync/wait mode scan */ +#define XFS_ICWALK_FLAG_UID (1U << 1) /* filter by uid */ +#define XFS_ICWALK_FLAG_GID (1U << 2) /* filter by gid */ +#define XFS_ICWALK_FLAG_PRID (1U << 3) /* filter by project id */ +#define XFS_ICWALK_FLAG_MINFILESIZE (1U << 4) /* filter by min file size */ + +#define XFS_ICWALK_FLAGS_VALID (XFS_ICWALK_FLAG_SYNC | \ + XFS_ICWALK_FLAG_UID | \ + XFS_ICWALK_FLAG_GID | \ + XFS_ICWALK_FLAG_PRID | \ + XFS_ICWALK_FLAG_MINFILESIZE) + /* * Flags for xfs_iget() */ @@ -43,8 +56,8 @@ void xfs_inode_mark_reclaimable(struct xfs_inode *ip);
int xfs_blockgc_free_dquots(struct xfs_mount *mp, struct xfs_dquot *udqp, struct xfs_dquot *gdqp, struct xfs_dquot *pdqp, - unsigned int eof_flags); -int xfs_blockgc_free_quota(struct xfs_inode *ip, unsigned int eof_flags); + unsigned int iwalk_flags); +int xfs_blockgc_free_quota(struct xfs_inode *ip, unsigned int iwalk_flags); int xfs_blockgc_free_space(struct xfs_mount *mp, struct xfs_eofblocks *eofb);
void xfs_inode_set_eofblocks_tag(struct xfs_inode *ip); diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index 57cdfac0e7d3..4efb49412afe 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -2038,7 +2038,18 @@ xfs_fs_eofblocks_from_user( memchr_inv(src->pad64, 0, sizeof(src->pad64))) return -EINVAL;
- dst->eof_flags = src->eof_flags; + dst->eof_flags = 0; + if (src->eof_flags & XFS_EOF_FLAGS_SYNC) + dst->eof_flags |= XFS_ICWALK_FLAG_SYNC; + if (src->eof_flags & XFS_EOF_FLAGS_UID) + dst->eof_flags |= XFS_ICWALK_FLAG_UID; + if (src->eof_flags & XFS_EOF_FLAGS_GID) + dst->eof_flags |= XFS_ICWALK_FLAG_GID; + if (src->eof_flags & XFS_EOF_FLAGS_PRID) + dst->eof_flags |= XFS_ICWALK_FLAG_PRID; + if (src->eof_flags & XFS_EOF_FLAGS_MINFILESIZE) + dst->eof_flags |= XFS_ICWALK_FLAG_MINFILESIZE; + dst->eof_prid = src->eof_prid; dst->eof_min_file_size = src->eof_min_file_size;
From: "Darrick J. Wong" djwong@kernel.org
mainline-inclusion from mainline-v5.13-rc4 commit b26b2bf14f823e9597118c01993aeba9aeb9a701 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4KIAO CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
-------------------------------------------------
The xfs_eofblocks structure is no longer well-named -- nowadays it provides optional filtering criteria to any walk of the incore inode cache. Only one of the cache walk goals has anything to do with clearing of speculative post-EOF preallocations, so change the name to be more appropriate.
Signed-off-by: Darrick J. Wong djwong@kernel.org Reviewed-by: Dave Chinner dchinner@redhat.com Signed-off-by: Lihong Kou koulihong@huawei.com Reviewed-by: Zhang Yi yi.zhang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- fs/xfs/xfs_file.c | 6 +- fs/xfs/xfs_icache.c | 164 ++++++++++++++++++++++---------------------- fs/xfs/xfs_icache.h | 14 ++-- fs/xfs/xfs_ioctl.c | 40 +++++------ fs/xfs/xfs_trace.h | 36 +++++----- 5 files changed, 130 insertions(+), 130 deletions(-)
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 7264b6d11f42..c6845b503667 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -705,14 +705,14 @@ xfs_file_buffered_aio_write( cleared_space = true; goto write_retry; } else if (ret == -ENOSPC && !cleared_space) { - struct xfs_eofblocks eofb = {0}; + struct xfs_icwalk icw = {0};
cleared_space = true; xfs_flush_inodes(ip->i_mount);
xfs_iunlock(ip, iolock); - eofb.eof_flags = XFS_ICWALK_FLAG_SYNC; - xfs_blockgc_free_space(ip->i_mount, &eofb); + icw.icw_flags = XFS_ICWALK_FLAG_SYNC; + xfs_blockgc_free_space(ip->i_mount, &icw); goto write_retry; }
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index 986d191b04c5..84f408fabb95 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -56,13 +56,13 @@ xfs_icwalk_tag(enum xfs_icwalk_goal goal) }
static int xfs_icwalk(struct xfs_mount *mp, - enum xfs_icwalk_goal goal, struct xfs_eofblocks *eofb); + enum xfs_icwalk_goal goal, struct xfs_icwalk *icw); static int xfs_icwalk_ag(struct xfs_perag *pag, - enum xfs_icwalk_goal goal, struct xfs_eofblocks *eofb); + enum xfs_icwalk_goal goal, struct xfs_icwalk *icw);
/* - * Private inode cache walk flags for struct xfs_eofblocks. Must not coincide - * with XFS_ICWALK_FLAGS_VALID. + * Private inode cache walk flags for struct xfs_icwalk. Must not + * coincide with XFS_ICWALK_FLAGS_VALID. */ #define XFS_ICWALK_FLAG_DROP_UDQUOT (1U << 31) #define XFS_ICWALK_FLAG_DROP_GDQUOT (1U << 30) @@ -847,21 +847,21 @@ xfs_dqrele_igrab( static void xfs_dqrele_inode( struct xfs_inode *ip, - struct xfs_eofblocks *eofb) + struct xfs_icwalk *icw) { if (xfs_iflags_test(ip, XFS_INEW)) xfs_inew_wait(ip);
xfs_ilock(ip, XFS_ILOCK_EXCL); - if (eofb->eof_flags & XFS_ICWALK_FLAG_DROP_UDQUOT) { + if (icw->icw_flags & XFS_ICWALK_FLAG_DROP_UDQUOT) { xfs_qm_dqrele(ip->i_udquot); ip->i_udquot = NULL; } - if (eofb->eof_flags & XFS_ICWALK_FLAG_DROP_GDQUOT) { + if (icw->icw_flags & XFS_ICWALK_FLAG_DROP_GDQUOT) { xfs_qm_dqrele(ip->i_gdquot); ip->i_gdquot = NULL; } - if (eofb->eof_flags & XFS_ICWALK_FLAG_DROP_PDQUOT) { + if (icw->icw_flags & XFS_ICWALK_FLAG_DROP_PDQUOT) { xfs_qm_dqrele(ip->i_pdquot); ip->i_pdquot = NULL; } @@ -879,16 +879,16 @@ xfs_dqrele_all_inodes( struct xfs_mount *mp, unsigned int qflags) { - struct xfs_eofblocks eofb = { .eof_flags = 0 }; + struct xfs_icwalk icw = { .icw_flags = 0 };
if (qflags & XFS_UQUOTA_ACCT) - eofb.eof_flags |= XFS_ICWALK_FLAG_DROP_UDQUOT; + icw.icw_flags |= XFS_ICWALK_FLAG_DROP_UDQUOT; if (qflags & XFS_GQUOTA_ACCT) - eofb.eof_flags |= XFS_ICWALK_FLAG_DROP_GDQUOT; + icw.icw_flags |= XFS_ICWALK_FLAG_DROP_GDQUOT; if (qflags & XFS_PQUOTA_ACCT) - eofb.eof_flags |= XFS_ICWALK_FLAG_DROP_PDQUOT; + icw.icw_flags |= XFS_ICWALK_FLAG_DROP_PDQUOT;
- return xfs_icwalk(mp, XFS_ICWALK_DQRELE, &eofb); + return xfs_icwalk(mp, XFS_ICWALK_DQRELE, &icw); } #else # define xfs_dqrele_igrab(ip) (false) @@ -915,7 +915,7 @@ xfs_dqrele_all_inodes( static bool xfs_reclaim_igrab( struct xfs_inode *ip, - struct xfs_eofblocks *eofb) + struct xfs_icwalk *icw) { ASSERT(rcu_read_lock_held());
@@ -929,7 +929,7 @@ xfs_reclaim_igrab(
/* Don't reclaim a sick inode unless the caller asked for it. */ if (ip->i_sick && - (!eofb || !(eofb->eof_flags & XFS_ICWALK_FLAG_RECLAIM_SICK))) { + (!icw || !(icw->icw_flags & XFS_ICWALK_FLAG_RECLAIM_SICK))) { spin_unlock(&ip->i_flags_lock); return false; } @@ -1048,16 +1048,16 @@ void xfs_reclaim_inodes( struct xfs_mount *mp) { - struct xfs_eofblocks eofb = { - .eof_flags = 0, + struct xfs_icwalk icw = { + .icw_flags = 0, };
if (xfs_want_reclaim_sick(mp)) - eofb.eof_flags |= XFS_ICWALK_FLAG_RECLAIM_SICK; + icw.icw_flags |= XFS_ICWALK_FLAG_RECLAIM_SICK;
while (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_RECLAIM_TAG)) { xfs_ail_push_all_sync(mp->m_ail); - xfs_icwalk(mp, XFS_ICWALK_RECLAIM, &eofb); + xfs_icwalk(mp, XFS_ICWALK_RECLAIM, &icw); } }
@@ -1073,19 +1073,19 @@ xfs_reclaim_inodes_nr( struct xfs_mount *mp, int nr_to_scan) { - struct xfs_eofblocks eofb = { - .eof_flags = XFS_ICWALK_FLAG_SCAN_LIMIT, + struct xfs_icwalk icw = { + .icw_flags = XFS_ICWALK_FLAG_SCAN_LIMIT, .icw_scan_limit = nr_to_scan, };
if (xfs_want_reclaim_sick(mp)) - eofb.eof_flags |= XFS_ICWALK_FLAG_RECLAIM_SICK; + icw.icw_flags |= XFS_ICWALK_FLAG_RECLAIM_SICK;
/* kick background reclaimer and push the AIL */ xfs_reclaim_work_queue(mp); xfs_ail_push_all(mp->m_ail);
- xfs_icwalk(mp, XFS_ICWALK_RECLAIM, &eofb); + xfs_icwalk(mp, XFS_ICWALK_RECLAIM, &icw); return 0; }
@@ -1110,20 +1110,20 @@ xfs_reclaim_inodes_count( }
STATIC bool -xfs_inode_match_id( +xfs_icwalk_match_id( struct xfs_inode *ip, - struct xfs_eofblocks *eofb) + struct xfs_icwalk *icw) { - if ((eofb->eof_flags & XFS_ICWALK_FLAG_UID) && - !uid_eq(VFS_I(ip)->i_uid, eofb->eof_uid)) + if ((icw->icw_flags & XFS_ICWALK_FLAG_UID) && + !uid_eq(VFS_I(ip)->i_uid, icw->icw_uid)) return false;
- if ((eofb->eof_flags & XFS_ICWALK_FLAG_GID) && - !gid_eq(VFS_I(ip)->i_gid, eofb->eof_gid)) + if ((icw->icw_flags & XFS_ICWALK_FLAG_GID) && + !gid_eq(VFS_I(ip)->i_gid, icw->icw_gid)) return false;
- if ((eofb->eof_flags & XFS_ICWALK_FLAG_PRID) && - ip->i_d.di_projid != eofb->eof_prid) + if ((icw->icw_flags & XFS_ICWALK_FLAG_PRID) && + ip->i_d.di_projid != icw->icw_prid) return false;
return true; @@ -1134,20 +1134,20 @@ xfs_inode_match_id( * criteria match. This is for global/internal scans only. */ STATIC bool -xfs_inode_match_id_union( +xfs_icwalk_match_id_union( struct xfs_inode *ip, - struct xfs_eofblocks *eofb) + struct xfs_icwalk *icw) { - if ((eofb->eof_flags & XFS_ICWALK_FLAG_UID) && - uid_eq(VFS_I(ip)->i_uid, eofb->eof_uid)) + if ((icw->icw_flags & XFS_ICWALK_FLAG_UID) && + uid_eq(VFS_I(ip)->i_uid, icw->icw_uid)) return true;
- if ((eofb->eof_flags & XFS_ICWALK_FLAG_GID) && - gid_eq(VFS_I(ip)->i_gid, eofb->eof_gid)) + if ((icw->icw_flags & XFS_ICWALK_FLAG_GID) && + gid_eq(VFS_I(ip)->i_gid, icw->icw_gid)) return true;
- if ((eofb->eof_flags & XFS_ICWALK_FLAG_PRID) && - ip->i_d.di_projid == eofb->eof_prid) + if ((icw->icw_flags & XFS_ICWALK_FLAG_PRID) && + ip->i_d.di_projid == icw->icw_prid) return true;
return false; @@ -1155,29 +1155,29 @@ xfs_inode_match_id_union(
/* * Is this inode @ip eligible for eof/cow block reclamation, given some - * filtering parameters @eofb? The inode is eligible if @eofb is null or + * filtering parameters @icw? The inode is eligible if @icw is null or * if the predicate functions match. */ static bool -xfs_inode_matches_eofb( +xfs_icwalk_match( struct xfs_inode *ip, - struct xfs_eofblocks *eofb) + struct xfs_icwalk *icw) { bool match;
- if (!eofb) + if (!icw) return true;
- if (eofb->eof_flags & XFS_ICWALK_FLAG_UNION) - match = xfs_inode_match_id_union(ip, eofb); + if (icw->icw_flags & XFS_ICWALK_FLAG_UNION) + match = xfs_icwalk_match_id_union(ip, icw); else - match = xfs_inode_match_id(ip, eofb); + match = xfs_icwalk_match_id(ip, icw); if (!match) return false;
/* skip the inode if the file size is too small */ - if ((eofb->eof_flags & XFS_ICWALK_FLAG_MINFILESIZE) && - XFS_ISIZE(ip) < eofb->eof_min_file_size) + if ((icw->icw_flags & XFS_ICWALK_FLAG_MINFILESIZE) && + XFS_ISIZE(ip) < icw->icw_min_file_size) return false;
return true; @@ -1203,12 +1203,12 @@ xfs_reclaim_worker( STATIC int xfs_inode_free_eofblocks( struct xfs_inode *ip, - struct xfs_eofblocks *eofb, + struct xfs_icwalk *icw, unsigned int *lockflags) { bool wait;
- wait = eofb && (eofb->eof_flags & XFS_ICWALK_FLAG_SYNC); + wait = icw && (icw->icw_flags & XFS_ICWALK_FLAG_SYNC);
if (!xfs_iflags_test(ip, XFS_IEOFBLOCKS)) return 0; @@ -1227,7 +1227,7 @@ xfs_inode_free_eofblocks( if (!wait && mapping_tagged(VFS_I(ip)->i_mapping, PAGECACHE_TAG_DIRTY)) return 0;
- if (!xfs_inode_matches_eofb(ip, eofb)) + if (!xfs_icwalk_match(ip, icw)) return 0;
/* @@ -1366,13 +1366,13 @@ xfs_prep_free_cowblocks( STATIC int xfs_inode_free_cowblocks( struct xfs_inode *ip, - struct xfs_eofblocks *eofb, + struct xfs_icwalk *icw, unsigned int *lockflags) { bool wait; int ret = 0;
- wait = eofb && (eofb->eof_flags & XFS_ICWALK_FLAG_SYNC); + wait = icw && (icw->icw_flags & XFS_ICWALK_FLAG_SYNC);
if (!xfs_iflags_test(ip, XFS_ICOWBLOCKS)) return 0; @@ -1380,7 +1380,7 @@ xfs_inode_free_cowblocks( if (!xfs_prep_free_cowblocks(ip)) return 0;
- if (!xfs_inode_matches_eofb(ip, eofb)) + if (!xfs_icwalk_match(ip, icw)) return 0;
/* @@ -1505,16 +1505,16 @@ xfs_blockgc_igrab( static int xfs_blockgc_scan_inode( struct xfs_inode *ip, - struct xfs_eofblocks *eofb) + struct xfs_icwalk *icw) { unsigned int lockflags = 0; int error;
- error = xfs_inode_free_eofblocks(ip, eofb, &lockflags); + error = xfs_inode_free_eofblocks(ip, icw, &lockflags); if (error) goto unlock;
- error = xfs_inode_free_cowblocks(ip, eofb, &lockflags); + error = xfs_inode_free_cowblocks(ip, icw, &lockflags); unlock: if (lockflags) xfs_iunlock(ip, lockflags); @@ -1548,11 +1548,11 @@ xfs_blockgc_worker( int xfs_blockgc_free_space( struct xfs_mount *mp, - struct xfs_eofblocks *eofb) + struct xfs_icwalk *icw) { - trace_xfs_blockgc_free_space(mp, eofb, _RET_IP_); + trace_xfs_blockgc_free_space(mp, icw, _RET_IP_);
- return xfs_icwalk(mp, XFS_ICWALK_BLOCKGC, eofb); + return xfs_icwalk(mp, XFS_ICWALK_BLOCKGC, icw); }
/* @@ -1573,7 +1573,7 @@ xfs_blockgc_free_dquots( struct xfs_dquot *pdqp, unsigned int iwalk_flags) { - struct xfs_eofblocks eofb = {0}; + struct xfs_icwalk icw = {0}; bool do_work = false;
if (!udqp && !gdqp && !pdqp) @@ -1583,30 +1583,30 @@ xfs_blockgc_free_dquots( * Run a scan to free blocks using the union filter to cover all * applicable quotas in a single scan. */ - eofb.eof_flags = XFS_ICWALK_FLAG_UNION | iwalk_flags; + icw.icw_flags = XFS_ICWALK_FLAG_UNION | iwalk_flags;
if (XFS_IS_UQUOTA_ENFORCED(mp) && udqp && xfs_dquot_lowsp(udqp)) { - eofb.eof_uid = make_kuid(mp->m_super->s_user_ns, udqp->q_id); - eofb.eof_flags |= XFS_ICWALK_FLAG_UID; + icw.icw_uid = make_kuid(mp->m_super->s_user_ns, udqp->q_id); + icw.icw_flags |= XFS_ICWALK_FLAG_UID; do_work = true; }
if (XFS_IS_UQUOTA_ENFORCED(mp) && gdqp && xfs_dquot_lowsp(gdqp)) { - eofb.eof_gid = make_kgid(mp->m_super->s_user_ns, gdqp->q_id); - eofb.eof_flags |= XFS_ICWALK_FLAG_GID; + icw.icw_gid = make_kgid(mp->m_super->s_user_ns, gdqp->q_id); + icw.icw_flags |= XFS_ICWALK_FLAG_GID; do_work = true; }
if (XFS_IS_PQUOTA_ENFORCED(mp) && pdqp && xfs_dquot_lowsp(pdqp)) { - eofb.eof_prid = pdqp->q_id; - eofb.eof_flags |= XFS_ICWALK_FLAG_PRID; + icw.icw_prid = pdqp->q_id; + icw.icw_flags |= XFS_ICWALK_FLAG_PRID; do_work = true; }
if (!do_work) return 0;
- return xfs_blockgc_free_space(mp, &eofb); + return xfs_blockgc_free_space(mp, &icw); }
/* Run cow/eofblocks scans on the quotas attached to the inode. */ @@ -1640,7 +1640,7 @@ static inline bool xfs_icwalk_igrab( enum xfs_icwalk_goal goal, struct xfs_inode *ip, - struct xfs_eofblocks *eofb) + struct xfs_icwalk *icw) { switch (goal) { case XFS_ICWALK_DQRELE: @@ -1648,7 +1648,7 @@ xfs_icwalk_igrab( case XFS_ICWALK_BLOCKGC: return xfs_blockgc_igrab(ip); case XFS_ICWALK_RECLAIM: - return xfs_reclaim_igrab(ip, eofb); + return xfs_reclaim_igrab(ip, icw); default: return false; } @@ -1663,16 +1663,16 @@ xfs_icwalk_process_inode( enum xfs_icwalk_goal goal, struct xfs_inode *ip, struct xfs_perag *pag, - struct xfs_eofblocks *eofb) + struct xfs_icwalk *icw) { int error = 0;
switch (goal) { case XFS_ICWALK_DQRELE: - xfs_dqrele_inode(ip, eofb); + xfs_dqrele_inode(ip, icw); break; case XFS_ICWALK_BLOCKGC: - error = xfs_blockgc_scan_inode(ip, eofb); + error = xfs_blockgc_scan_inode(ip, icw); break; case XFS_ICWALK_RECLAIM: xfs_reclaim_inode(ip, pag); @@ -1689,7 +1689,7 @@ static int xfs_icwalk_ag( struct xfs_perag *pag, enum xfs_icwalk_goal goal, - struct xfs_eofblocks *eofb) + struct xfs_icwalk *icw) { struct xfs_mount *mp = pag->pag_mount; uint32_t first_index; @@ -1737,7 +1737,7 @@ xfs_icwalk_ag( for (i = 0; i < nr_found; i++) { struct xfs_inode *ip = batch[i];
- if (done || !xfs_icwalk_igrab(goal, ip, eofb)) + if (done || !xfs_icwalk_igrab(goal, ip, icw)) batch[i] = NULL;
/* @@ -1766,7 +1766,7 @@ xfs_icwalk_ag( if (!batch[i]) continue; error = xfs_icwalk_process_inode(goal, batch[i], pag, - eofb); + icw); if (error == -EAGAIN) { skipped++; continue; @@ -1781,9 +1781,9 @@ xfs_icwalk_ag(
cond_resched();
- if (eofb && (eofb->eof_flags & XFS_ICWALK_FLAG_SCAN_LIMIT)) { - eofb->icw_scan_limit -= XFS_LOOKUP_BATCH; - if (eofb->icw_scan_limit <= 0) + if (icw && (icw->icw_flags & XFS_ICWALK_FLAG_SCAN_LIMIT)) { + icw->icw_scan_limit -= XFS_LOOKUP_BATCH; + if (icw->icw_scan_limit <= 0) break; } } while (nr_found && !done); @@ -1820,7 +1820,7 @@ static int xfs_icwalk( struct xfs_mount *mp, enum xfs_icwalk_goal goal, - struct xfs_eofblocks *eofb) + struct xfs_icwalk *icw) { struct xfs_perag *pag; int error = 0; @@ -1829,7 +1829,7 @@ xfs_icwalk(
while ((pag = xfs_icwalk_get_perag(mp, agno, goal))) { agno = pag->pag_agno + 1; - error = xfs_icwalk_ag(pag, goal, eofb); + error = xfs_icwalk_ag(pag, goal, icw); xfs_perag_put(pag); if (error) { last_error = error; diff --git a/fs/xfs/xfs_icache.h b/fs/xfs/xfs_icache.h index b29048c493b6..00dc98a92835 100644 --- a/fs/xfs/xfs_icache.h +++ b/fs/xfs/xfs_icache.h @@ -9,12 +9,12 @@ struct xfs_mount; struct xfs_perag;
-struct xfs_eofblocks { - __u32 eof_flags; - kuid_t eof_uid; - kgid_t eof_gid; - prid_t eof_prid; - __u64 eof_min_file_size; +struct xfs_icwalk { + __u32 icw_flags; + kuid_t icw_uid; + kgid_t icw_gid; + prid_t icw_prid; + __u64 icw_min_file_size; int icw_scan_limit; };
@@ -58,7 +58,7 @@ int xfs_blockgc_free_dquots(struct xfs_mount *mp, struct xfs_dquot *udqp, struct xfs_dquot *gdqp, struct xfs_dquot *pdqp, unsigned int iwalk_flags); int xfs_blockgc_free_quota(struct xfs_inode *ip, unsigned int iwalk_flags); -int xfs_blockgc_free_space(struct xfs_mount *mp, struct xfs_eofblocks *eofb); +int xfs_blockgc_free_space(struct xfs_mount *mp, struct xfs_icwalk *icw);
void xfs_inode_set_eofblocks_tag(struct xfs_inode *ip); void xfs_inode_clear_eofblocks_tag(struct xfs_inode *ip); diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index 4efb49412afe..5eb78915412a 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -2026,7 +2026,7 @@ xfs_ioc_setlabel( static inline int xfs_fs_eofblocks_from_user( struct xfs_fs_eofblocks *src, - struct xfs_eofblocks *dst) + struct xfs_icwalk *dst) { if (src->eof_version != XFS_EOFBLOCKS_VERSION) return -EINVAL; @@ -2038,32 +2038,32 @@ xfs_fs_eofblocks_from_user( memchr_inv(src->pad64, 0, sizeof(src->pad64))) return -EINVAL;
- dst->eof_flags = 0; + dst->icw_flags = 0; if (src->eof_flags & XFS_EOF_FLAGS_SYNC) - dst->eof_flags |= XFS_ICWALK_FLAG_SYNC; + dst->icw_flags |= XFS_ICWALK_FLAG_SYNC; if (src->eof_flags & XFS_EOF_FLAGS_UID) - dst->eof_flags |= XFS_ICWALK_FLAG_UID; + dst->icw_flags |= XFS_ICWALK_FLAG_UID; if (src->eof_flags & XFS_EOF_FLAGS_GID) - dst->eof_flags |= XFS_ICWALK_FLAG_GID; + dst->icw_flags |= XFS_ICWALK_FLAG_GID; if (src->eof_flags & XFS_EOF_FLAGS_PRID) - dst->eof_flags |= XFS_ICWALK_FLAG_PRID; + dst->icw_flags |= XFS_ICWALK_FLAG_PRID; if (src->eof_flags & XFS_EOF_FLAGS_MINFILESIZE) - dst->eof_flags |= XFS_ICWALK_FLAG_MINFILESIZE; + dst->icw_flags |= XFS_ICWALK_FLAG_MINFILESIZE;
- dst->eof_prid = src->eof_prid; - dst->eof_min_file_size = src->eof_min_file_size; + dst->icw_prid = src->eof_prid; + dst->icw_min_file_size = src->eof_min_file_size;
- dst->eof_uid = INVALID_UID; + dst->icw_uid = INVALID_UID; if (src->eof_flags & XFS_EOF_FLAGS_UID) { - dst->eof_uid = make_kuid(current_user_ns(), src->eof_uid); - if (!uid_valid(dst->eof_uid)) + dst->icw_uid = make_kuid(current_user_ns(), src->eof_uid); + if (!uid_valid(dst->icw_uid)) return -EINVAL; }
- dst->eof_gid = INVALID_GID; + dst->icw_gid = INVALID_GID; if (src->eof_flags & XFS_EOF_FLAGS_GID) { - dst->eof_gid = make_kgid(current_user_ns(), src->eof_gid); - if (!gid_valid(dst->eof_gid)) + dst->icw_gid = make_kgid(current_user_ns(), src->eof_gid); + if (!gid_valid(dst->icw_gid)) return -EINVAL; } return 0; @@ -2334,8 +2334,8 @@ xfs_file_ioctl( return xfs_errortag_clearall(mp);
case XFS_IOC_FREE_EOFBLOCKS: { - struct xfs_fs_eofblocks eofb; - struct xfs_eofblocks keofb; + struct xfs_fs_eofblocks eofb; + struct xfs_icwalk icw;
if (!capable(CAP_SYS_ADMIN)) return -EPERM; @@ -2346,14 +2346,14 @@ xfs_file_ioctl( if (copy_from_user(&eofb, arg, sizeof(eofb))) return -EFAULT;
- error = xfs_fs_eofblocks_from_user(&eofb, &keofb); + error = xfs_fs_eofblocks_from_user(&eofb, &icw); if (error) return error;
- trace_xfs_ioc_free_eofblocks(mp, &keofb, _RET_IP_); + trace_xfs_ioc_free_eofblocks(mp, &icw, _RET_IP_);
sb_start_write(mp->m_super); - error = xfs_blockgc_free_space(mp, &keofb); + error = xfs_blockgc_free_space(mp, &icw); sb_end_write(mp->m_super); return error; } diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 9a328461ce78..4bce2fc9cbdc 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -37,7 +37,7 @@ struct xfs_trans_res; struct xfs_inobt_rec_incore; union xfs_btree_ptr; struct xfs_dqtrx; -struct xfs_eofblocks; +struct xfs_icwalk;
#define XFS_ATTR_FILTER_FLAGS \ { XFS_ATTR_ROOT, "ROOT" }, \ @@ -3865,10 +3865,10 @@ DEFINE_EVENT(xfs_timestamp_range_class, name, \ DEFINE_TIMESTAMP_RANGE_EVENT(xfs_inode_timestamp_range); DEFINE_TIMESTAMP_RANGE_EVENT(xfs_quota_expiry_range);
-DECLARE_EVENT_CLASS(xfs_eofblocks_class, - TP_PROTO(struct xfs_mount *mp, struct xfs_eofblocks *eofb, +DECLARE_EVENT_CLASS(xfs_icwalk_class, + TP_PROTO(struct xfs_mount *mp, struct xfs_icwalk *icw, unsigned long caller_ip), - TP_ARGS(mp, eofb, caller_ip), + TP_ARGS(mp, icw, caller_ip), TP_STRUCT__entry( __field(dev_t, dev) __field(__u32, flags) @@ -3881,14 +3881,14 @@ DECLARE_EVENT_CLASS(xfs_eofblocks_class, ), TP_fast_assign( __entry->dev = mp->m_super->s_dev; - __entry->flags = eofb ? eofb->eof_flags : 0; - __entry->uid = eofb ? from_kuid(mp->m_super->s_user_ns, - eofb->eof_uid) : 0; - __entry->gid = eofb ? from_kgid(mp->m_super->s_user_ns, - eofb->eof_gid) : 0; - __entry->prid = eofb ? eofb->eof_prid : 0; - __entry->min_file_size = eofb ? eofb->eof_min_file_size : 0; - __entry->scan_limit = eofb ? eofb->icw_scan_limit : 0; + __entry->flags = icw ? icw->icw_flags : 0; + __entry->uid = icw ? from_kuid(mp->m_super->s_user_ns, + icw->icw_uid) : 0; + __entry->gid = icw ? from_kgid(mp->m_super->s_user_ns, + icw->icw_gid) : 0; + __entry->prid = icw ? icw->icw_prid : 0; + __entry->min_file_size = icw ? icw->icw_min_file_size : 0; + __entry->scan_limit = icw ? icw->icw_scan_limit : 0; __entry->caller_ip = caller_ip; ), TP_printk("dev %d:%d flags 0x%x uid %u gid %u prid %u minsize %llu scan_limit %d caller %pS", @@ -3901,13 +3901,13 @@ DECLARE_EVENT_CLASS(xfs_eofblocks_class, __entry->scan_limit, (char *)__entry->caller_ip) ); -#define DEFINE_EOFBLOCKS_EVENT(name) \ -DEFINE_EVENT(xfs_eofblocks_class, name, \ - TP_PROTO(struct xfs_mount *mp, struct xfs_eofblocks *eofb, \ +#define DEFINE_ICWALK_EVENT(name) \ +DEFINE_EVENT(xfs_icwalk_class, name, \ + TP_PROTO(struct xfs_mount *mp, struct xfs_icwalk *icw, \ unsigned long caller_ip), \ - TP_ARGS(mp, eofb, caller_ip)) -DEFINE_EOFBLOCKS_EVENT(xfs_ioc_free_eofblocks); -DEFINE_EOFBLOCKS_EVENT(xfs_blockgc_free_space); + TP_ARGS(mp, icw, caller_ip)) +DEFINE_ICWALK_EVENT(xfs_ioc_free_eofblocks); +DEFINE_ICWALK_EVENT(xfs_blockgc_free_space);
#endif /* _TRACE_XFS_H */
From: Dave Chinner dchinner@redhat.com
mainline-inclusion from mainline-v5.13-rc4 commit 956f6daa84bf50dd5bd13a64b57cae446bca3899 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4KIAO CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
-------------------------------------------------
For the DEBUGS! Add a trace event to each iclog state transition (activate, clean, syncing, callback, switch, force and friends) so that iclog state machine problems can be debugged directly from the trace buffer.
Signed-off-by: Dave Chinner dchinner@redhat.com Reviewed-by: Christoph Hellwig hch@lst.de Reviewed-by: Darrick J. Wong djwong@kernel.org Signed-off-by: Darrick J. Wong djwong@kernel.org Signed-off-by: Lihong Kou koulihong@huawei.com Reviewed-by: Zhang Yi yi.zhang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- fs/xfs/xfs_log.c | 18 +++++++++++++ fs/xfs/xfs_log_priv.h | 10 ++++++++ fs/xfs/xfs_trace.h | 60 +++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 88 insertions(+)
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 46cb093e365f..205bce9cf6f9 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -480,6 +480,7 @@ xlog_state_release_iclog( xfs_lsn_t tail_lsn; lockdep_assert_held(&log->l_icloglock);
+ trace_xlog_iclog_release(iclog, _RET_IP_); if (iclog->ic_state == XLOG_STATE_IOERROR) return -EIO;
@@ -507,6 +508,7 @@ xlog_state_release_iclog( iclog->ic_state = XLOG_STATE_SYNCING; iclog->ic_header.h_tail_lsn = cpu_to_be64(tail_lsn); xlog_verify_tail_lsn(log, iclog, tail_lsn); + trace_xlog_iclog_syncing(iclog, _RET_IP_);
spin_unlock(&log->l_icloglock); xlog_sync(log, iclog); @@ -780,6 +782,7 @@ xlog_wait_on_iclog( { struct xlog *log = iclog->ic_log;
+ trace_xlog_iclog_wait_on(iclog, _RET_IP_); if (!XLOG_FORCED_SHUTDOWN(log) && iclog->ic_state != XLOG_STATE_ACTIVE && iclog->ic_state != XLOG_STATE_DIRTY) { @@ -1679,6 +1682,7 @@ xlog_write_iclog( unsigned int count) { ASSERT(bno < log->l_logBBsize); + trace_xlog_iclog_write(iclog, _RET_IP_);
/* * We lock the iclogbufs here so that we can serialise against I/O @@ -1834,6 +1838,7 @@ xlog_sync( unsigned int size;
ASSERT(atomic_read(&iclog->ic_refcnt) == 0); + trace_xlog_iclog_sync(iclog, _RET_IP_);
count = xlog_calc_iclog_size(log, iclog, &roundoff);
@@ -2471,6 +2476,7 @@ xlog_state_activate_iclog( int *iclogs_changed) { ASSERT(list_empty_careful(&iclog->ic_callbacks)); + trace_xlog_iclog_activate(iclog, _RET_IP_);
/* * If the number of ops in this iclog indicate it just contains the @@ -2561,6 +2567,8 @@ xlog_state_clean_iclog( { int iclogs_changed = 0;
+ trace_xlog_iclog_clean(dirty_iclog, _RET_IP_); + dirty_iclog->ic_state = XLOG_STATE_DIRTY;
xlog_state_activate_iclogs(log, &iclogs_changed); @@ -2620,6 +2628,7 @@ xlog_state_set_callback( struct xlog_in_core *iclog, xfs_lsn_t header_lsn) { + trace_xlog_iclog_callback(iclog, _RET_IP_); iclog->ic_state = XLOG_STATE_CALLBACK;
ASSERT(XFS_LSN_CMP(atomic64_read(&log->l_last_sync_lsn), @@ -2727,7 +2736,9 @@ xlog_state_do_callback( list_splice_init(&iclog->ic_callbacks, &cb_list); spin_unlock(&log->l_icloglock);
+ trace_xlog_iclog_callbacks_start(iclog, _RET_IP_); xlog_cil_process_committed(&cb_list); + trace_xlog_iclog_callbacks_done(iclog, _RET_IP_); cycled_icloglock = true;
spin_lock(&log->l_icloglock); @@ -2776,6 +2787,7 @@ xlog_state_done_syncing(
spin_lock(&log->l_icloglock); ASSERT(atomic_read(&iclog->ic_refcnt) == 0); + trace_xlog_iclog_sync_done(iclog, _RET_IP_);
/* * If we got an error, either on the first buffer, or in the case of @@ -2849,6 +2861,8 @@ xlog_state_get_iclog_space( atomic_inc(&iclog->ic_refcnt); /* prevents sync */ log_offset = iclog->ic_offset;
+ trace_xlog_iclog_get_space(iclog, _RET_IP_); + /* On the 1st write to an iclog, figure out lsn. This works * if iclogs marked XLOG_STATE_WANT_SYNC always write out what they are * committing to. If the offset is set, that's how many blocks @@ -3014,6 +3028,7 @@ xlog_state_switch_iclogs( { ASSERT(iclog->ic_state == XLOG_STATE_ACTIVE); assert_spin_locked(&log->l_icloglock); + trace_xlog_iclog_switch(iclog, _RET_IP_);
if (!eventual_size) eventual_size = iclog->ic_offset; @@ -3124,6 +3139,8 @@ xfs_log_force( if (iclog->ic_state == XLOG_STATE_IOERROR) goto out_error;
+ trace_xlog_iclog_force(iclog, _RET_IP_); + if (iclog->ic_state == XLOG_STATE_DIRTY || (iclog->ic_state == XLOG_STATE_ACTIVE && atomic_read(&iclog->ic_refcnt) == 0 && iclog->ic_offset == 0)) { @@ -3192,6 +3209,7 @@ xlog_force_lsn( goto out_error;
while (be64_to_cpu(iclog->ic_header.h_lsn) != lsn) { + trace_xlog_iclog_force_lsn(iclog, _RET_IP_); iclog = iclog->ic_next; if (iclog == log->l_iclog) goto out_unlock; diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h index 2c1ae43ac667..7cbde0b4f990 100644 --- a/fs/xfs/xfs_log_priv.h +++ b/fs/xfs/xfs_log_priv.h @@ -50,6 +50,16 @@ enum xlog_iclog_state { XLOG_STATE_IOERROR, /* IO error happened in sync'ing log */ };
+#define XLOG_STATE_STRINGS \ + { XLOG_STATE_ACTIVE, "XLOG_STATE_ACTIVE" }, \ + { XLOG_STATE_WANT_SYNC, "XLOG_STATE_WANT_SYNC" }, \ + { XLOG_STATE_SYNCING, "XLOG_STATE_SYNCING" }, \ + { XLOG_STATE_DONE_SYNC, "XLOG_STATE_DONE_SYNC" }, \ + { XLOG_STATE_CALLBACK, "XLOG_STATE_CALLBACK" }, \ + { XLOG_STATE_DIRTY, "XLOG_STATE_DIRTY" }, \ + { XLOG_STATE_IOERROR, "XLOG_STATE_IOERROR" } + + /* * Log ticket flags */ diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 4bce2fc9cbdc..d025b7df7158 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -24,6 +24,7 @@ struct xlog_ticket; struct xlog_recover; struct xlog_recover_item; struct xlog_rec_header; +struct xlog_in_core; struct xfs_buf_log_format; struct xfs_inode_log_format; struct xfs_bmbt_irec; @@ -3909,6 +3910,65 @@ DEFINE_EVENT(xfs_icwalk_class, name, \ DEFINE_ICWALK_EVENT(xfs_ioc_free_eofblocks); DEFINE_ICWALK_EVENT(xfs_blockgc_free_space);
+TRACE_DEFINE_ENUM(XLOG_STATE_ACTIVE); +TRACE_DEFINE_ENUM(XLOG_STATE_WANT_SYNC); +TRACE_DEFINE_ENUM(XLOG_STATE_SYNCING); +TRACE_DEFINE_ENUM(XLOG_STATE_DONE_SYNC); +TRACE_DEFINE_ENUM(XLOG_STATE_CALLBACK); +TRACE_DEFINE_ENUM(XLOG_STATE_DIRTY); +TRACE_DEFINE_ENUM(XLOG_STATE_IOERROR); + +DECLARE_EVENT_CLASS(xlog_iclog_class, + TP_PROTO(struct xlog_in_core *iclog, unsigned long caller_ip), + TP_ARGS(iclog, caller_ip), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(uint32_t, state) + __field(int32_t, refcount) + __field(uint32_t, offset) + __field(unsigned long long, lsn) + __field(unsigned long, caller_ip) + ), + TP_fast_assign( + __entry->dev = iclog->ic_log->l_mp->m_super->s_dev; + __entry->state = iclog->ic_state; + __entry->refcount = atomic_read(&iclog->ic_refcnt); + __entry->offset = iclog->ic_offset; + __entry->lsn = be64_to_cpu(iclog->ic_header.h_lsn); + __entry->caller_ip = caller_ip; + ), + TP_printk("dev %d:%d state %s refcnt %d offset %u lsn 0x%llx caller %pS", + MAJOR(__entry->dev), MINOR(__entry->dev), + __print_symbolic(__entry->state, XLOG_STATE_STRINGS), + __entry->refcount, + __entry->offset, + __entry->lsn, + (char *)__entry->caller_ip) + +); + +#define DEFINE_ICLOG_EVENT(name) \ +DEFINE_EVENT(xlog_iclog_class, name, \ + TP_PROTO(struct xlog_in_core *iclog, unsigned long caller_ip), \ + TP_ARGS(iclog, caller_ip)) + +DEFINE_ICLOG_EVENT(xlog_iclog_activate); +DEFINE_ICLOG_EVENT(xlog_iclog_clean); +DEFINE_ICLOG_EVENT(xlog_iclog_callback); +DEFINE_ICLOG_EVENT(xlog_iclog_callbacks_start); +DEFINE_ICLOG_EVENT(xlog_iclog_callbacks_done); +DEFINE_ICLOG_EVENT(xlog_iclog_force); +DEFINE_ICLOG_EVENT(xlog_iclog_force_lsn); +DEFINE_ICLOG_EVENT(xlog_iclog_get_space); +DEFINE_ICLOG_EVENT(xlog_iclog_release); +DEFINE_ICLOG_EVENT(xlog_iclog_switch); +DEFINE_ICLOG_EVENT(xlog_iclog_sync); +DEFINE_ICLOG_EVENT(xlog_iclog_syncing); +DEFINE_ICLOG_EVENT(xlog_iclog_sync_done); +DEFINE_ICLOG_EVENT(xlog_iclog_want_sync); +DEFINE_ICLOG_EVENT(xlog_iclog_wait_on); +DEFINE_ICLOG_EVENT(xlog_iclog_write); + #endif /* _TRACE_XFS_H */
#undef TRACE_INCLUDE_PATH
From: "Darrick J. Wong" djwong@kernel.org
mainline-inclusion from mainline-v5.13-rc4 commit ff7bebeb91f8cc2e26e7dabbf301da5ec0e9328c category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4KIAO CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
-------------------------------------------------
Hoist the code in xfs_iget_cache_hit that restores the VFS inode state to an xfs_inode that was previously vfs-destroyed. The next patch will add a new set of state flags, so we need the helper to avoid duplication.
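In caller terms, the new helper's contract looks like this (a sketch distilled from the hunks below):

	/* on entry: the rcu read lock and ip->i_flags_lock are held */
	error = xfs_iget_recycle(pag, ip);	/* drops both locks on return */
	if (error)
		return error;	/* the inode was put back on the reclaim list */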
Signed-off-by: Darrick J. Wong djwong@kernel.org Reviewed-by: Dave Chinner dchinner@redhat.com Signed-off-by: Lihong Kou koulihong@huawei.com Reviewed-by: Zhang Yi yi.zhang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- fs/xfs/xfs_icache.c | 143 +++++++++++++++++++++++++------------------- fs/xfs/xfs_trace.h | 4 +- 2 files changed, 83 insertions(+), 64 deletions(-)
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index 84f408fabb95..5fcda72f5367 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -354,14 +354,14 @@ xfs_reinit_inode( struct xfs_mount *mp, struct inode *inode) { - int error; - uint32_t nlink = inode->i_nlink; - uint32_t generation = inode->i_generation; - uint64_t version = inode_peek_iversion(inode); - umode_t mode = inode->i_mode; - dev_t dev = inode->i_rdev; - kuid_t uid = inode->i_uid; - kgid_t gid = inode->i_gid; + int error; + uint32_t nlink = inode->i_nlink; + uint32_t generation = inode->i_generation; + uint64_t version = inode_peek_iversion(inode); + umode_t mode = inode->i_mode; + dev_t dev = inode->i_rdev; + kuid_t uid = inode->i_uid; + kgid_t gid = inode->i_gid;
error = inode_init_always(mp->m_super, inode);
@@ -375,6 +375,74 @@ xfs_reinit_inode( return error; }
+/* + * Carefully nudge an inode whose VFS state has been torn down back into a + * usable state. Drops the i_flags_lock and the rcu read lock. + */ +static int +xfs_iget_recycle( + struct xfs_perag *pag, + struct xfs_inode *ip) __releases(&ip->i_flags_lock) +{ + struct xfs_mount *mp = ip->i_mount; + struct inode *inode = VFS_I(ip); + int error; + + trace_xfs_iget_recycle(ip); + + /* + * We need to make it look like the inode is being reclaimed to prevent + * the actual reclaim workers from stomping over us while we recycle + * the inode. We can't clear the radix tree tag yet as it requires + * pag_ici_lock to be held exclusive. + */ + ip->i_flags |= XFS_IRECLAIM; + + spin_unlock(&ip->i_flags_lock); + rcu_read_unlock(); + + ASSERT(!rwsem_is_locked(&inode->i_rwsem)); + error = xfs_reinit_inode(mp, inode); + if (error) { + bool wake; + + /* + * Re-initializing the inode failed, and we are in deep + * trouble. Try to re-add it to the reclaim list. + */ + rcu_read_lock(); + spin_lock(&ip->i_flags_lock); + wake = !!__xfs_iflags_test(ip, XFS_INEW); + ip->i_flags &= ~(XFS_INEW | XFS_IRECLAIM); + if (wake) + wake_up_bit(&ip->i_flags, __XFS_INEW_BIT); + ASSERT(ip->i_flags & XFS_IRECLAIMABLE); + spin_unlock(&ip->i_flags_lock); + rcu_read_unlock(); + + trace_xfs_iget_recycle_fail(ip); + return error; + } + + spin_lock(&pag->pag_ici_lock); + spin_lock(&ip->i_flags_lock); + + /* + * Clear the per-lifetime state in the inode as we are now effectively + * a new inode and need to return to the initial state before reuse + * occurs. + */ + ip->i_flags &= ~XFS_IRECLAIM_RESET_FLAGS; + ip->i_flags |= XFS_INEW; + xfs_perag_clear_inode_tag(pag, XFS_INO_TO_AGINO(mp, ip->i_ino), + XFS_ICI_RECLAIM_TAG); + inode->i_state = I_NEW; + spin_unlock(&ip->i_flags_lock); + spin_unlock(&pag->pag_ici_lock); + + return 0; +} + /* * If we are allocating a new inode, then check what was returned is * actually a free, empty inode. If we are not allocating an inode, @@ -449,7 +517,7 @@ xfs_iget_cache_hit( /* * If we are racing with another cache hit that is currently * instantiating this inode or currently recycling it out of - * reclaimabe state, wait for the initialisation to complete + * reclaimable state, wait for the initialisation to complete * before continuing. * * XXX(hch): eventually we should do something equivalent to @@ -471,64 +539,16 @@ xfs_iget_cache_hit( if (error) goto out_error;
- /* - * If IRECLAIMABLE is set, we've torn down the VFS inode already. - * Need to carefully get it back into useable state. - */ if (ip->i_flags & XFS_IRECLAIMABLE) { - trace_xfs_iget_reclaim(ip); - if (flags & XFS_IGET_INCORE) { error = -EAGAIN; goto out_error; }
- /* - * We need to set XFS_IRECLAIM to prevent xfs_reclaim_inode - * from stomping over us while we recycle the inode. We can't - * clear the radix tree reclaimable tag yet as it requires - * pag_ici_lock to be held exclusive. - */ - ip->i_flags |= XFS_IRECLAIM; - - spin_unlock(&ip->i_flags_lock); - rcu_read_unlock(); - - ASSERT(!rwsem_is_locked(&inode->i_rwsem)); - error = xfs_reinit_inode(mp, inode); - if (error) { - bool wake; - /* - * Re-initializing the inode failed, and we are in deep - * trouble. Try to re-add it to the reclaim list. - */ - rcu_read_lock(); - spin_lock(&ip->i_flags_lock); - wake = !!__xfs_iflags_test(ip, XFS_INEW); - ip->i_flags &= ~(XFS_INEW | XFS_IRECLAIM); - if (wake) - wake_up_bit(&ip->i_flags, __XFS_INEW_BIT); - ASSERT(ip->i_flags & XFS_IRECLAIMABLE); - trace_xfs_iget_reclaim_fail(ip); - goto out_error; - } - - spin_lock(&pag->pag_ici_lock); - spin_lock(&ip->i_flags_lock); - - /* - * Clear the per-lifetime state in the inode as we are now - * effectively a new inode and need to return to the initial - * state before reuse occurs. - */ - ip->i_flags &= ~XFS_IRECLAIM_RESET_FLAGS; - ip->i_flags |= XFS_INEW; - xfs_perag_clear_inode_tag(pag, - XFS_INO_TO_AGINO(pag->pag_mount, ino), - XFS_ICI_RECLAIM_TAG); - inode->i_state = I_NEW; - spin_unlock(&ip->i_flags_lock); - spin_unlock(&pag->pag_ici_lock); + /* Drops i_flags_lock and RCU read lock. */ + error = xfs_iget_recycle(pag, ip); + if (error) + return error; } else { /* If the VFS inode is being torn down, pause and try again. */ if (!igrab(inode)) { @@ -558,7 +578,6 @@ xfs_iget_cache_hit( return error; }
- static int xfs_iget_cache_miss( struct xfs_mount *mp, diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index d025b7df7158..8fa6d8bb4312 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -613,8 +613,8 @@ DEFINE_EVENT(xfs_inode_class, name, \ TP_PROTO(struct xfs_inode *ip), \ TP_ARGS(ip)) DEFINE_INODE_EVENT(xfs_iget_skip); -DEFINE_INODE_EVENT(xfs_iget_reclaim); -DEFINE_INODE_EVENT(xfs_iget_reclaim_fail); +DEFINE_INODE_EVENT(xfs_iget_recycle); +DEFINE_INODE_EVENT(xfs_iget_recycle_fail); DEFINE_INODE_EVENT(xfs_iget_hit); DEFINE_INODE_EVENT(xfs_iget_miss);
From: "Darrick J. Wong" djwong@kernel.org
mainline-inclusion from mainline-v5.13-rc4 commit 77b4d2861e8381d00e4b9bd1be2a355dda99ff60 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4KIAO CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
-------------------------------------------------
During review of the v6 deferred inode inactivation patchset[1], Dave commented that _cache_hit should have a clear separation between inode selection criteria and actions performed on a selected inode. Move a hunk to make this true, and compact the shrink cases in the function.
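Condensed, the reshaped xfs_iget_cache_hit() separates the two phases like this (a sketch distilled from the hunks below, error handling elided):

	spin_lock(&ip->i_flags_lock);

	/* selection: does this inode qualify at all? */
	if (ip->i_ino != ino)
		goto out_skip;
	if (ip->i_flags & (XFS_INEW | XFS_IRECLAIM))
		goto out_skip;
	if ((flags & XFS_IGET_INCORE) && (ip->i_flags & XFS_IRECLAIMABLE))
		goto out_skip;

	/* action: the inode fits the selection criteria, process it */
	if (ip->i_flags & XFS_IRECLAIMABLE)
		error = xfs_iget_recycle(pag, ip);	/* drops the locks */
	else if (!igrab(inode))
		goto out_skip;				/* being torn down, retry */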
[1] https://lore.kernel.org/linux-xfs/162310469340.3465262.504398465311182657.st... Signed-off-by: Darrick J. Wong djwong@kernel.org Reviewed-by: Brian Foster bfoster@redhat.com Signed-off-by: Lihong Kou koulihong@huawei.com Reviewed-by: Zhang Yi yi.zhang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- fs/xfs/xfs_icache.c | 39 ++++++++++++++++----------------------- 1 file changed, 16 insertions(+), 23 deletions(-)
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index 5fcda72f5367..562b6523afff 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -506,13 +506,8 @@ xfs_iget_cache_hit( * will not match, so check for that, too. */ spin_lock(&ip->i_flags_lock); - if (ip->i_ino != ino) { - trace_xfs_iget_skip(ip); - XFS_STATS_INC(mp, xs_ig_frecycle); - error = -EAGAIN; - goto out_error; - } - + if (ip->i_ino != ino) + goto out_skip;
/* * If we are racing with another cache hit that is currently @@ -524,12 +519,8 @@ xfs_iget_cache_hit( * wait_on_inode to wait for these flags to be cleared * instead of polling for it. */ - if (ip->i_flags & (XFS_INEW|XFS_IRECLAIM)) { - trace_xfs_iget_skip(ip); - XFS_STATS_INC(mp, xs_ig_frecycle); - error = -EAGAIN; - goto out_error; - } + if (ip->i_flags & (XFS_INEW | XFS_IRECLAIM)) + goto out_skip;
/* * Check the inode free state is valid. This also detects lookup @@ -539,23 +530,21 @@ xfs_iget_cache_hit( if (error) goto out_error;
- if (ip->i_flags & XFS_IRECLAIMABLE) { - if (flags & XFS_IGET_INCORE) { - error = -EAGAIN; - goto out_error; - } + /* Skip inodes that have no vfs state. */ + if ((flags & XFS_IGET_INCORE) && + (ip->i_flags & XFS_IRECLAIMABLE)) + goto out_skip;
+ /* The inode fits the selection criteria; process it. */ + if (ip->i_flags & XFS_IRECLAIMABLE) { /* Drops i_flags_lock and RCU read lock. */ error = xfs_iget_recycle(pag, ip); if (error) return error; } else { /* If the VFS inode is being torn down, pause and try again. */ - if (!igrab(inode)) { - trace_xfs_iget_skip(ip); - error = -EAGAIN; - goto out_error; - } + if (!igrab(inode)) + goto out_skip;
/* We've got a live one. */ spin_unlock(&ip->i_flags_lock); @@ -572,6 +561,10 @@ xfs_iget_cache_hit(
return 0;
+out_skip: + trace_xfs_iget_skip(ip); + XFS_STATS_INC(mp, xs_ig_frecycle); + error = -EAGAIN; out_error: spin_unlock(&ip->i_flags_lock); rcu_read_unlock();
From: "Darrick J. Wong" djwong@kernel.org
mainline-inclusion from mainline-v5.12-rc2 commit d336f7ebc65007f5831e2297e6f3383ae8dbf8ed category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4KIAO CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
-------------------------------------------------
If we allocate quota inodes in the process of mounting a filesystem but then decide to abort the mount, it's possible that the quota inodes are sitting around pinned by the log. Now that inode reclaim relies on the AIL to flush inodes, we have to force the log and push the AIL in between releasing the quota inodes and kicking off reclaim to tear down all the incore inodes. Do this by extracting the bits we need from the unmount path and reusing them. As an added bonus, failed writes during a failed mount will not retry forever now.
This was originally found during a fuzz test of metadata directories (xfs/1546), but the actual symptom was that reclaim hung up on the quota inodes.
Signed-off-by: Darrick J. Wong djwong@kernel.org Reviewed-by: Christoph Hellwig hch@lst.de Reviewed-by: Dave Chinner dchinner@redhat.com Signed-off-by: Lihong Kou koulihong@huawei.com Reviewed-by: Zhang Yi yi.zhang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- fs/xfs/xfs_mount.c | 90 +++++++++++++++++++++++----------------------- 1 file changed, 44 insertions(+), 46 deletions(-)
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index 6845d9d432b4..d7ce93e16cb8 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -634,6 +634,47 @@ xfs_check_summary_counts( return xfs_initialize_perag_data(mp, mp->m_sb.sb_agcount); }
+/* + * Flush and reclaim dirty inodes in preparation for unmount. Inodes and + * internal inode structures can be sitting in the CIL and AIL at this point, + * so we need to unpin them, write them back and/or reclaim them before unmount + * can proceed. + * + * An inode cluster that has been freed can have its buffer still pinned in + * memory because the transaction is still sitting in a iclog. The stale inodes + * on that buffer will be pinned to the buffer until the transaction hits the + * disk and the callbacks run. Pushing the AIL will skip the stale inodes and + * may never see the pinned buffer, so nothing will push out the iclog and + * unpin the buffer. + * + * Hence we need to force the log to unpin everything first. However, log + * forces don't wait for the discards they issue to complete, so we have to + * explicitly wait for them to complete here as well. + * + * Then we can tell the world we are unmounting so that error handling knows + * that the filesystem is going away and we should error out anything that we + * have been retrying in the background. This will prevent never-ending + * retries in AIL pushing from hanging the unmount. + * + * Finally, we can push the AIL to clean all the remaining dirty objects, then + * reclaim the remaining inodes that are still in memory at this point in time. + */ +static void +xfs_unmount_flush_inodes( + struct xfs_mount *mp) +{ + xfs_log_force(mp, XFS_LOG_SYNC); + xfs_extent_busy_wait_all(mp); + flush_workqueue(xfs_discard_wq); + + mp->m_flags |= XFS_MOUNT_UNMOUNTING; + + xfs_ail_push_all_sync(mp->m_ail); + cancel_delayed_work_sync(&mp->m_reclaim_work); + xfs_reclaim_inodes(mp); + xfs_health_unmount(mp); +} + /* * This function does the following on an initial mount of a file system: * - reads the superblock from disk and init the mount struct @@ -1016,7 +1057,7 @@ xfs_mountfs( /* Clean out dquots that might be in memory after quotacheck. */ xfs_qm_unmount(mp); /* - * Cancel all delayed reclaim work and reclaim the inodes directly. + * Flush all inode reclamation work and flush the log. * We have to do this /after/ rtunmount and qm_unmount because those * two will have scheduled delayed reclaim for the rt/quota inodes. * @@ -1026,11 +1067,8 @@ xfs_mountfs( * qm_unmount_quotas and therefore rely on qm_unmount to release the * quota inodes. */ - cancel_delayed_work_sync(&mp->m_reclaim_work); - xfs_reclaim_inodes(mp); - xfs_health_unmount(mp); + xfs_unmount_flush_inodes(mp); out_log_dealloc: - mp->m_flags |= XFS_MOUNT_UNMOUNTING; xfs_log_mount_cancel(mp); out_fail_wait: if (mp->m_logdev_targp && mp->m_logdev_targp != mp->m_ddev_targp) @@ -1071,47 +1109,7 @@ xfs_unmountfs( xfs_rtunmount_inodes(mp); xfs_irele(mp->m_rootip);
- /* - * We can potentially deadlock here if we have an inode cluster - * that has been freed has its buffer still pinned in memory because - * the transaction is still sitting in a iclog. The stale inodes - * on that buffer will be pinned to the buffer until the - * transaction hits the disk and the callbacks run. Pushing the AIL will - * skip the stale inodes and may never see the pinned buffer, so - * nothing will push out the iclog and unpin the buffer. Hence we - * need to force the log here to ensure all items are flushed into the - * AIL before we go any further. - */ - xfs_log_force(mp, XFS_LOG_SYNC); - - /* - * Wait for all busy extents to be freed, including completion of - * any discard operation. - */ - xfs_extent_busy_wait_all(mp); - flush_workqueue(xfs_discard_wq); - - /* - * We now need to tell the world we are unmounting. This will allow - * us to detect that the filesystem is going away and we should error - * out anything that we have been retrying in the background. This will - * prevent neverending retries in AIL pushing from hanging the unmount. - */ - mp->m_flags |= XFS_MOUNT_UNMOUNTING; - - /* - * Flush all pending changes from the AIL. - */ - xfs_ail_push_all_sync(mp->m_ail); - - /* - * Reclaim all inodes. At this point there should be no dirty inodes and - * none should be pinned or locked. Stop background inode reclaim here - * if it is still running. - */ - cancel_delayed_work_sync(&mp->m_reclaim_work); - xfs_reclaim_inodes(mp); - xfs_health_unmount(mp); + xfs_unmount_flush_inodes(mp);
xfs_qm_unmount(mp);
From: "Darrick J. Wong" djwong@kernel.org
mainline-inclusion from mainline-v5.11-rc4 commit 9febcda6f8d1db9f922945d026bb838864b1b6d5 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4KIAO CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
-------------------------------------------------
Brian Foster reported a lockdep warning on xfs/167:
============================================ WARNING: possible recursive locking detected 5.11.0-rc4 #35 Tainted: G W I -------------------------------------------- fsstress/17733 is trying to acquire lock: ffff8e0fd1d90650 (sb_internal){++++}-{0:0}, at: xfs_free_eofblocks+0x104/0x1d0 [xfs]
but task is already holding lock: ffff8e0fd1d90650 (sb_internal){++++}-{0:0}, at: xfs_trans_alloc_inode+0x5f/0x160 [xfs]
stack backtrace: CPU: 38 PID: 17733 Comm: fsstress Tainted: G W I 5.11.0-rc4 #35 Hardware name: Dell Inc. PowerEdge R740/01KPX8, BIOS 1.6.11 11/20/2018 Call Trace: dump_stack+0x8b/0xb0 __lock_acquire.cold+0x159/0x2ab lock_acquire+0x116/0x370 xfs_trans_alloc+0x1ad/0x310 [xfs] xfs_free_eofblocks+0x104/0x1d0 [xfs] xfs_blockgc_scan_inode+0x24/0x60 [xfs] xfs_inode_walk_ag+0x202/0x4b0 [xfs] xfs_inode_walk+0x66/0xc0 [xfs] xfs_trans_alloc+0x160/0x310 [xfs] xfs_trans_alloc_inode+0x5f/0x160 [xfs] xfs_alloc_file_space+0x105/0x300 [xfs] xfs_file_fallocate+0x270/0x460 [xfs] vfs_fallocate+0x14d/0x3d0 __x64_sys_fallocate+0x3e/0x70 do_syscall_64+0x33/0x40 entry_SYSCALL_64_after_hwframe+0x44/0xa9
The cause of this is the new code that spurs a scan to garbage collect speculative preallocations if we fail to reserve enough blocks while allocating a transaction. While the warning itself is a fairly benign lockdep complaint, it does expose a potential livelock if the rwsem behavior ever changes with regards to nesting read locks when someone's waiting for a write lock.
Fix this by freeing the transaction and jumping back to xfs_trans_alloc like this patch in the V4 submission[1].
[1] https://lore.kernel.org/linux-xfs/161142798066.2171939.9311024588681972086.s...
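Reduced to its control flow, the fixed path looks like this (a sketch of the patch below, not the complete function; the key point is that xfs_trans_cancel() releases sb_internal before the blockgc scan re-enters transaction allocation):

	retry:
		tp = kmem_cache_zalloc(xfs_trans_zone, GFP_KERNEL | __GFP_NOFAIL);
		...
		error = xfs_trans_reserve(tp, resp, blocks, rtextents);
		if (error == -ENOSPC && want_retry) {
			/* drop the transaction and its freeze protection first */
			xfs_trans_cancel(tp);

			error = xfs_blockgc_free_space(mp, NULL);
			if (error)
				return error;

			want_retry = false;
			goto retry;	/* one more try with a fresh transaction */
		}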
Fixes: a1a7d05a0576 ("xfs: flush speculative space allocations when we run out of space") Reported-by: Brian Foster bfoster@redhat.com Signed-off-by: Darrick J. Wong djwong@kernel.org Reviewed-by: Brian Foster bfoster@redhat.com Reviewed-by: Allison Henderson allison.henderson@oracle.com Reviewed-by: Christoph Hellwig hch@lst.de Signed-off-by: Lihong Kou koulihong@huawei.com Reviewed-by: Zhang Yi yi.zhang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- fs/xfs/xfs_trans.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-)
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index 3a628c2ea8ac..b2d74a22b231 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -260,6 +260,7 @@ xfs_trans_alloc( struct xfs_trans **tpp) { struct xfs_trans *tp; + bool want_retry = true; int error;
/* @@ -267,6 +268,7 @@ xfs_trans_alloc( * GFP_NOFS allocation context so that we avoid lockdep false positives * by doing GFP_KERNEL allocations inside sb_start_intwrite(). */ +retry: tp = kmem_cache_zalloc(xfs_trans_zone, GFP_KERNEL | __GFP_NOFAIL); if (!(flags & XFS_TRANS_NO_WRITECOUNT)) sb_start_intwrite(mp->m_super); @@ -289,7 +291,9 @@ xfs_trans_alloc( tp->t_firstblock = NULLFSBLOCK;
error = xfs_trans_reserve(tp, resp, blocks, rtextents); - if (error == -ENOSPC) { + if (error == -ENOSPC && want_retry) { + xfs_trans_cancel(tp); + /* * We weren't able to reserve enough space for the transaction. * Flush the other speculative space allocations to free space. @@ -297,8 +301,11 @@ xfs_trans_alloc( * other locks. */ error = xfs_blockgc_free_space(mp, NULL); - if (!error) - error = xfs_trans_reserve(tp, resp, blocks, rtextents); + if (error) + return error; + + want_retry = false; + goto retry; } if (error) { xfs_trans_cancel(tp);
From: Christoph Hellwig hch@lst.de
mainline-inclusion from mainline-v5.14-rc4 commit 40b52225e58cd3adf9358146b4b39dccfbfe5892 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4KIAO CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
-------------------------------------------------
Disabling quota accounting is hairy, racy code with all kinds of pitfalls. And it reflects a very strange mindset, as quota accounting (unlike enforcement) really is a property of the on-disk format. There is no good use case for supporting this.
Signed-off-by: Christoph Hellwig hch@lst.de Reviewed-by: Carlos Maiolino cmaiolino@redhat.com Reviewed-by: Darrick J. Wong djwong@kernel.org Signed-off-by: Darrick J. Wong djwong@kernel.org Reviewed-by: Zhang Yi yi.zhang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- fs/xfs/libxfs/xfs_trans_resv.c | 30 ---- fs/xfs/libxfs/xfs_trans_resv.h | 2 - fs/xfs/xfs_dquot_item.c | 134 ------------------ fs/xfs/xfs_dquot_item.h | 17 --- fs/xfs/xfs_qm.c | 2 +- fs/xfs/xfs_qm.h | 3 - fs/xfs/xfs_qm_syscalls.c | 241 ++------------------------------- fs/xfs/xfs_trans_dquot.c | 38 ------ 8 files changed, 13 insertions(+), 454 deletions(-)
diff --git a/fs/xfs/libxfs/xfs_trans_resv.c b/fs/xfs/libxfs/xfs_trans_resv.c index d1a0848cb52e..ce12c8142bd1 100644 --- a/fs/xfs/libxfs/xfs_trans_resv.c +++ b/fs/xfs/libxfs/xfs_trans_resv.c @@ -798,29 +798,6 @@ xfs_calc_qm_dqalloc_reservation( XFS_FSB_TO_B(mp, XFS_DQUOT_CLUSTER_SIZE_FSB) - 1); }
-/* - * Turning off quotas. - * the quota off logitems: sizeof(struct xfs_qoff_logitem) * 2 - * the superblock for the quota flags: sector size - */ -STATIC uint -xfs_calc_qm_quotaoff_reservation( - struct xfs_mount *mp) -{ - return sizeof(struct xfs_qoff_logitem) * 2 + - xfs_calc_buf_res(1, mp->m_sb.sb_sectsize); -} - -/* - * End of turning off quotas. - * the quota off logitems: sizeof(struct xfs_qoff_logitem) * 2 - */ -STATIC uint -xfs_calc_qm_quotaoff_end_reservation(void) -{ - return sizeof(struct xfs_qoff_logitem) * 2; -} - /* * Syncing the incore super block changes to disk. * the super block to reflect the changes: sector size @@ -923,13 +900,6 @@ xfs_trans_resv_calc( resp->tr_qm_setqlim.tr_logres = xfs_calc_qm_setqlim_reservation(); resp->tr_qm_setqlim.tr_logcount = XFS_DEFAULT_LOG_COUNT;
- resp->tr_qm_quotaoff.tr_logres = xfs_calc_qm_quotaoff_reservation(mp); - resp->tr_qm_quotaoff.tr_logcount = XFS_DEFAULT_LOG_COUNT; - - resp->tr_qm_equotaoff.tr_logres = - xfs_calc_qm_quotaoff_end_reservation(); - resp->tr_qm_equotaoff.tr_logcount = XFS_DEFAULT_LOG_COUNT; - resp->tr_sb.tr_logres = xfs_calc_sb_reservation(mp); resp->tr_sb.tr_logcount = XFS_DEFAULT_LOG_COUNT;
diff --git a/fs/xfs/libxfs/xfs_trans_resv.h b/fs/xfs/libxfs/xfs_trans_resv.h index 7241ab28cf84..fc4e9b369a3a 100644 --- a/fs/xfs/libxfs/xfs_trans_resv.h +++ b/fs/xfs/libxfs/xfs_trans_resv.h @@ -46,8 +46,6 @@ struct xfs_trans_resv { struct xfs_trans_res tr_growrtfree; /* grow realtime freeing */ struct xfs_trans_res tr_qm_setqlim; /* adjust quota limits */ struct xfs_trans_res tr_qm_dqalloc; /* allocate quota on disk */ - struct xfs_trans_res tr_qm_quotaoff; /* turn quota off */ - struct xfs_trans_res tr_qm_equotaoff;/* end of turn quota off */ struct xfs_trans_res tr_sb; /* modify superblock */ struct xfs_trans_res tr_fsyncts; /* update timestamps on fsync */ }; diff --git a/fs/xfs/xfs_dquot_item.c b/fs/xfs/xfs_dquot_item.c index 8ed47b739b6c..6a1aae799cf1 100644 --- a/fs/xfs/xfs_dquot_item.c +++ b/fs/xfs/xfs_dquot_item.c @@ -218,137 +218,3 @@ xfs_qm_dquot_logitem_init( &xfs_dquot_item_ops); lp->qli_dquot = dqp; } - -/*------------------ QUOTAOFF LOG ITEMS -------------------*/ - -static inline struct xfs_qoff_logitem *QOFF_ITEM(struct xfs_log_item *lip) -{ - return container_of(lip, struct xfs_qoff_logitem, qql_item); -} - - -/* - * This returns the number of iovecs needed to log the given quotaoff item. - * We only need 1 iovec for an quotaoff item. It just logs the - * quotaoff_log_format structure. - */ -STATIC void -xfs_qm_qoff_logitem_size( - struct xfs_log_item *lip, - int *nvecs, - int *nbytes) -{ - *nvecs += 1; - *nbytes += sizeof(struct xfs_qoff_logitem); -} - -STATIC void -xfs_qm_qoff_logitem_format( - struct xfs_log_item *lip, - struct xfs_log_vec *lv) -{ - struct xfs_qoff_logitem *qflip = QOFF_ITEM(lip); - struct xfs_log_iovec *vecp = NULL; - struct xfs_qoff_logformat *qlf; - - qlf = xlog_prepare_iovec(lv, &vecp, XLOG_REG_TYPE_QUOTAOFF); - qlf->qf_type = XFS_LI_QUOTAOFF; - qlf->qf_size = 1; - qlf->qf_flags = qflip->qql_flags; - xlog_finish_iovec(lv, vecp, sizeof(struct xfs_qoff_logitem)); -} - -/* - * There isn't much you can do to push a quotaoff item. It is simply - * stuck waiting for the log to be flushed to disk. - */ -STATIC uint -xfs_qm_qoff_logitem_push( - struct xfs_log_item *lip, - struct list_head *buffer_list) -{ - return XFS_ITEM_LOCKED; -} - -STATIC xfs_lsn_t -xfs_qm_qoffend_logitem_committed( - struct xfs_log_item *lip, - xfs_lsn_t lsn) -{ - struct xfs_qoff_logitem *qfe = QOFF_ITEM(lip); - struct xfs_qoff_logitem *qfs = qfe->qql_start_lip; - - xfs_qm_qoff_logitem_relse(qfs); - - kmem_free(lip->li_lv_shadow); - kmem_free(qfe); - return (xfs_lsn_t)-1; -} - -STATIC void -xfs_qm_qoff_logitem_release( - struct xfs_log_item *lip) -{ - struct xfs_qoff_logitem *qoff = QOFF_ITEM(lip); - - if (test_bit(XFS_LI_ABORTED, &lip->li_flags)) { - if (qoff->qql_start_lip) - xfs_qm_qoff_logitem_relse(qoff->qql_start_lip); - xfs_qm_qoff_logitem_relse(qoff); - } -} - -static const struct xfs_item_ops xfs_qm_qoffend_logitem_ops = { - .iop_size = xfs_qm_qoff_logitem_size, - .iop_format = xfs_qm_qoff_logitem_format, - .iop_committed = xfs_qm_qoffend_logitem_committed, - .iop_push = xfs_qm_qoff_logitem_push, - .iop_release = xfs_qm_qoff_logitem_release, -}; - -static const struct xfs_item_ops xfs_qm_qoff_logitem_ops = { - .iop_size = xfs_qm_qoff_logitem_size, - .iop_format = xfs_qm_qoff_logitem_format, - .iop_push = xfs_qm_qoff_logitem_push, - .iop_release = xfs_qm_qoff_logitem_release, -}; - -/* - * Delete the quotaoff intent from the AIL and free it. On success, - * this should only be called for the start item. It can be used for - * either on shutdown or abort. 
- */ -void -xfs_qm_qoff_logitem_relse( - struct xfs_qoff_logitem *qoff) -{ - struct xfs_log_item *lip = &qoff->qql_item; - - ASSERT(test_bit(XFS_LI_IN_AIL, &lip->li_flags) || - test_bit(XFS_LI_ABORTED, &lip->li_flags) || - XFS_FORCED_SHUTDOWN(lip->li_mountp)); - xfs_trans_ail_delete(lip, 0); - kmem_free(lip->li_lv_shadow); - kmem_free(qoff); -} - -/* - * Allocate and initialize an quotaoff item of the correct quota type(s). - */ -struct xfs_qoff_logitem * -xfs_qm_qoff_logitem_init( - struct xfs_mount *mp, - struct xfs_qoff_logitem *start, - uint flags) -{ - struct xfs_qoff_logitem *qf; - - qf = kmem_zalloc(sizeof(struct xfs_qoff_logitem), 0); - - xfs_log_item_init(mp, &qf->qql_item, XFS_LI_QUOTAOFF, start ? - &xfs_qm_qoffend_logitem_ops : &xfs_qm_qoff_logitem_ops); - qf->qql_item.li_mountp = mp; - qf->qql_start_lip = start; - qf->qql_flags = flags; - return qf; -} diff --git a/fs/xfs/xfs_dquot_item.h b/fs/xfs/xfs_dquot_item.h index 2b86a43d7ce2..794710c24474 100644 --- a/fs/xfs/xfs_dquot_item.h +++ b/fs/xfs/xfs_dquot_item.h @@ -9,7 +9,6 @@ struct xfs_dquot; struct xfs_trans; struct xfs_mount; -struct xfs_qoff_logitem;
struct xfs_dq_logitem { struct xfs_log_item qli_item; /* common portion */ @@ -17,22 +16,6 @@ struct xfs_dq_logitem { xfs_lsn_t qli_flush_lsn; /* lsn at last flush */ };
-struct xfs_qoff_logitem { - struct xfs_log_item qql_item; /* common portion */ - struct xfs_qoff_logitem *qql_start_lip; /* qoff-start logitem, if any */ - unsigned int qql_flags; -}; - - void xfs_qm_dquot_logitem_init(struct xfs_dquot *dqp); -struct xfs_qoff_logitem *xfs_qm_qoff_logitem_init(struct xfs_mount *mp, - struct xfs_qoff_logitem *start, - uint flags); -void xfs_qm_qoff_logitem_relse(struct xfs_qoff_logitem *); -struct xfs_qoff_logitem *xfs_trans_get_qoff_item(struct xfs_trans *tp, - struct xfs_qoff_logitem *startqoff, - uint flags); -void xfs_trans_log_quotaoff_item(struct xfs_trans *tp, - struct xfs_qoff_logitem *qlp);
#endif /* __XFS_DQUOT_ITEM_H__ */ diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index c660db6d5dde..dfee29e39d8e 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -183,7 +183,7 @@ xfs_qm_dqpurge( /* * Purge the dquot cache. */ -void +static void xfs_qm_dqpurge_all( struct xfs_mount *mp, uint flags) diff --git a/fs/xfs/xfs_qm.h b/fs/xfs/xfs_qm.h index ebbb484c49dc..442a0f97a9d4 100644 --- a/fs/xfs/xfs_qm.h +++ b/fs/xfs/xfs_qm.h @@ -140,9 +140,6 @@ struct xfs_dquot_acct {
extern void xfs_qm_destroy_quotainfo(struct xfs_mount *);
-/* dquot stuff */ -extern void xfs_qm_dqpurge_all(struct xfs_mount *, uint); - /* quota ops */ extern int xfs_qm_scall_trunc_qfiles(struct xfs_mount *, uint); extern int xfs_qm_scall_getquota(struct xfs_mount *mp, diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c index 24a04b8614a7..473b2458c6ca 100644 --- a/fs/xfs/xfs_qm_syscalls.c +++ b/fs/xfs/xfs_qm_syscalls.c @@ -19,91 +19,11 @@ #include "xfs_qm.h" #include "xfs_icache.h"
-STATIC int -xfs_qm_log_quotaoff( - struct xfs_mount *mp, - struct xfs_qoff_logitem **qoffstartp, - uint flags) -{ - struct xfs_trans *tp; - int error; - struct xfs_qoff_logitem *qoffi; - - error = xfs_trans_alloc(mp, &M_RES(mp)->tr_qm_quotaoff, 0, 0, 0, &tp); - if (error) - goto out; - - qoffi = xfs_trans_get_qoff_item(tp, NULL, flags & XFS_ALL_QUOTA_ACCT); - xfs_trans_log_quotaoff_item(tp, qoffi); - - spin_lock(&mp->m_sb_lock); - mp->m_sb.sb_qflags = (mp->m_qflags & ~(flags)) & XFS_MOUNT_QUOTA_ALL; - spin_unlock(&mp->m_sb_lock); - - xfs_log_sb(tp); - - /* - * We have to make sure that the transaction is secure on disk before we - * return and actually stop quota accounting. So, make it synchronous. - * We don't care about quotoff's performance. - */ - xfs_trans_set_sync(tp); - error = xfs_trans_commit(tp); - if (error) - goto out; - - *qoffstartp = qoffi; -out: - return error; -} - -STATIC int -xfs_qm_log_quotaoff_end( - struct xfs_mount *mp, - struct xfs_qoff_logitem **startqoff, - uint flags) -{ - struct xfs_trans *tp; - int error; - struct xfs_qoff_logitem *qoffi; - - error = xfs_trans_alloc(mp, &M_RES(mp)->tr_qm_equotaoff, 0, 0, 0, &tp); - if (error) - return error; - - qoffi = xfs_trans_get_qoff_item(tp, *startqoff, - flags & XFS_ALL_QUOTA_ACCT); - xfs_trans_log_quotaoff_item(tp, qoffi); - *startqoff = NULL; - - /* - * We have to make sure that the transaction is secure on disk before we - * return and actually stop quota accounting. So, make it synchronous. - * We don't care about quotoff's performance. - */ - xfs_trans_set_sync(tp); - return xfs_trans_commit(tp); -} - -/* - * Turn off quota accounting and/or enforcement for all udquots and/or - * gdquots. Called only at unmount time. - * - * This assumes that there are no dquots of this file system cached - * incore, and modifies the ondisk dquot directly. Therefore, for example, - * it is an error to call this twice, without purging the cache. - */ int xfs_qm_scall_quotaoff( xfs_mount_t *mp, uint flags) { - struct xfs_quotainfo *q = mp->m_quotainfo; - uint dqtype; - int error; - uint inactivate_flags; - struct xfs_qoff_logitem *qoffstart = NULL; - /* * No file system can have quotas enabled on disk but not in core. * Note that quota utilities (like quotaoff) _expect_ @@ -111,160 +31,23 @@ xfs_qm_scall_quotaoff( */ if ((mp->m_qflags & flags) == 0) return -EEXIST; - error = 0; - - flags &= (XFS_ALL_QUOTA_ACCT | XFS_ALL_QUOTA_ENFD);
/* - * We don't want to deal with two quotaoffs messing up each other, - * so we're going to serialize it. quotaoff isn't exactly a performance - * critical thing. - * If quotaoff, then we must be dealing with the root filesystem. + * We do not support actually turning off quota accounting any more. + * Just log a warning and ignore the accounting related flags. */ - ASSERT(q); - mutex_lock(&q->qi_quotaofflock); + if (flags & XFS_ALL_QUOTA_ACCT) + xfs_info(mp, "disabling of quota accounting not supported.");
- /* - * If we're just turning off quota enforcement, change mp and go. - */ - if ((flags & XFS_ALL_QUOTA_ACCT) == 0) { - mp->m_qflags &= ~(flags); - - spin_lock(&mp->m_sb_lock); - mp->m_sb.sb_qflags = mp->m_qflags; - spin_unlock(&mp->m_sb_lock); - mutex_unlock(&q->qi_quotaofflock); - - /* XXX what to do if error ? Revert back to old vals incore ? */ - return xfs_sync_sb(mp, false); - } - - dqtype = 0; - inactivate_flags = 0; - /* - * If accounting is off, we must turn enforcement off, clear the - * quota 'CHKD' certificate to make it known that we have to - * do a quotacheck the next time this quota is turned on. - */ - if (flags & XFS_UQUOTA_ACCT) { - dqtype |= XFS_QMOPT_UQUOTA; - flags |= (XFS_UQUOTA_CHKD | XFS_UQUOTA_ENFD); - inactivate_flags |= XFS_UQUOTA_ACTIVE; - } - if (flags & XFS_GQUOTA_ACCT) { - dqtype |= XFS_QMOPT_GQUOTA; - flags |= (XFS_GQUOTA_CHKD | XFS_GQUOTA_ENFD); - inactivate_flags |= XFS_GQUOTA_ACTIVE; - } - if (flags & XFS_PQUOTA_ACCT) { - dqtype |= XFS_QMOPT_PQUOTA; - flags |= (XFS_PQUOTA_CHKD | XFS_PQUOTA_ENFD); - inactivate_flags |= XFS_PQUOTA_ACTIVE; - } - - /* - * Nothing to do? Don't complain. This happens when we're just - * turning off quota enforcement. - */ - if ((mp->m_qflags & flags) == 0) - goto out_unlock; - - /* - * Write the LI_QUOTAOFF log record, and do SB changes atomically, - * and synchronously. If we fail to write, we should abort the - * operation as it cannot be recovered safely if we crash. - */ - error = xfs_qm_log_quotaoff(mp, &qoffstart, flags); - if (error) - goto out_unlock; - - /* - * Next we clear the XFS_MOUNT_*DQ_ACTIVE bit(s) in the mount struct - * to take care of the race between dqget and quotaoff. We don't take - * any special locks to reset these bits. All processes need to check - * these bits *after* taking inode lock(s) to see if the particular - * quota type is in the process of being turned off. If *ACTIVE, it is - * guaranteed that all dquot structures and all quotainode ptrs will all - * stay valid as long as that inode is kept locked. - * - * There is no turning back after this. - */ - mp->m_qflags &= ~inactivate_flags; - - /* - * Give back all the dquot reference(s) held by inodes. - * Here we go thru every single incore inode in this file system, and - * do a dqrele on the i_udquot/i_gdquot that it may have. - * Essentially, as long as somebody has an inode locked, this guarantees - * that quotas will not be turned off. This is handy because in a - * transaction once we lock the inode(s) and check for quotaon, we can - * depend on the quota inodes (and other things) being valid as long as - * we keep the lock(s). - */ - error = xfs_dqrele_all_inodes(mp, flags); - ASSERT(!error); - - /* - * Next we make the changes in the quota flag in the mount struct. - * This isn't protected by a particular lock directly, because we - * don't want to take a mrlock every time we depend on quotas being on. - */ - mp->m_qflags &= ~flags; - - /* - * Go through all the dquots of this file system and purge them, - * according to what was turned off. - */ - xfs_qm_dqpurge_all(mp, dqtype); - - /* - * Transactions that had started before ACTIVE state bit was cleared - * could have logged many dquots, so they'd have higher LSNs than - * the first QUOTAOFF log record does. If we happen to crash when - * the tail of the log has gone past the QUOTAOFF record, but - * before the last dquot modification, those dquots __will__ - * recover, and that's not good. 
- * - * So, we have QUOTAOFF start and end logitems; the start - * logitem won't get overwritten until the end logitem appears... - */ - error = xfs_qm_log_quotaoff_end(mp, &qoffstart, flags); - if (error) { - /* We're screwed now. Shutdown is the only option. */ - xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); - goto out_unlock; - } - - /* - * If all quotas are completely turned off, close shop. - */ - if (mp->m_qflags == 0) { - mutex_unlock(&q->qi_quotaofflock); - xfs_qm_destroy_quotainfo(mp); - return 0; - } - - /* - * Release our quotainode references if we don't need them anymore. - */ - if ((dqtype & XFS_QMOPT_UQUOTA) && q->qi_uquotaip) { - xfs_irele(q->qi_uquotaip); - q->qi_uquotaip = NULL; - } - if ((dqtype & XFS_QMOPT_GQUOTA) && q->qi_gquotaip) { - xfs_irele(q->qi_gquotaip); - q->qi_gquotaip = NULL; - } - if ((dqtype & XFS_QMOPT_PQUOTA) && q->qi_pquotaip) { - xfs_irele(q->qi_pquotaip); - q->qi_pquotaip = NULL; - } + mutex_lock(&mp->m_quotainfo->qi_quotaofflock); + mp->m_qflags &= ~(flags & XFS_ALL_QUOTA_ENFD); + spin_lock(&mp->m_sb_lock); + mp->m_sb.sb_qflags = mp->m_qflags; + spin_unlock(&mp->m_sb_lock); + mutex_unlock(&mp->m_quotainfo->qi_quotaofflock);
-out_unlock: - if (error && qoffstart) - xfs_qm_qoff_logitem_relse(qoffstart); - mutex_unlock(&q->qi_quotaofflock); - return error; + /* XXX what to do if error ? Revert back to old vals incore ? */ + return xfs_sync_sb(mp, false); }
STATIC int diff --git a/fs/xfs/xfs_trans_dquot.c b/fs/xfs/xfs_trans_dquot.c index 52af4c7313eb..3a7b4ad862a8 100644 --- a/fs/xfs/xfs_trans_dquot.c +++ b/fs/xfs/xfs_trans_dquot.c @@ -864,44 +864,6 @@ xfs_trans_reserve_quota_icreate( dblocks, 1, XFS_QMOPT_RES_REGBLKS); }
-/* - * This routine is called to allocate a quotaoff log item. - */ -struct xfs_qoff_logitem * -xfs_trans_get_qoff_item( - struct xfs_trans *tp, - struct xfs_qoff_logitem *startqoff, - uint flags) -{ - struct xfs_qoff_logitem *q; - - ASSERT(tp != NULL); - - q = xfs_qm_qoff_logitem_init(tp->t_mountp, startqoff, flags); - ASSERT(q != NULL); - - /* - * Get a log_item_desc to point at the new item. - */ - xfs_trans_add_item(tp, &q->qql_item); - return q; -} - - -/* - * This is called to mark the quotaoff logitem as needing - * to be logged when the transaction is committed. The logitem must - * already be associated with the given transaction. - */ -void -xfs_trans_log_quotaoff_item( - struct xfs_trans *tp, - struct xfs_qoff_logitem *qlp) -{ - tp->t_flags |= XFS_TRANS_DIRTY; - set_bit(XFS_LI_DIRTY, &qlp->qql_item.li_flags); -} - STATIC void xfs_trans_alloc_dqinfo( xfs_trans_t *tp)
From: Christoph Hellwig hch@lst.de
mainline-inclusion from mainline-v5.14-rc4 commit 777eb1fa857ec38afd518b3adc25cfac0f4af13b category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4KIAO CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
-------------------------------------------------
xfs_dqrele_all_inodes is unused now, remove it and all supporting code.
Signed-off-by: Christoph Hellwig hch@lst.de Reviewed-by: Darrick J. Wong djwong@kernel.org Signed-off-by: Darrick J. Wong djwong@kernel.org Reviewed-by: Zhang Yi yi.zhang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- fs/xfs/xfs_icache.c | 107 +------------------------------------------- fs/xfs/xfs_icache.h | 6 --- 2 files changed, 1 insertion(+), 112 deletions(-)
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index 562b6523afff..e3c89c511ee5 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -38,9 +38,6 @@ * radix tree tags when convenient. Avoid existing XFS_IWALK namespace. */ enum xfs_icwalk_goal { - /* Goals that are not related to tags; these must be < 0. */ - XFS_ICWALK_DQRELE = -1, - /* Goals directly associated with tagged inodes. */ XFS_ICWALK_BLOCKGC = XFS_ICI_BLOCKGC_TAG, XFS_ICWALK_RECLAIM = XFS_ICI_RECLAIM_TAG, @@ -64,9 +61,6 @@ static int xfs_icwalk_ag(struct xfs_perag *pag, * Private inode cache walk flags for struct xfs_icwalk. Must not * coincide with XFS_ICWALK_FLAGS_VALID. */ -#define XFS_ICWALK_FLAG_DROP_UDQUOT (1U << 31) -#define XFS_ICWALK_FLAG_DROP_GDQUOT (1U << 30) -#define XFS_ICWALK_FLAG_DROP_PDQUOT (1U << 29)
/* Stop scanning after icw_scan_limit inodes. */ #define XFS_ICWALK_FLAG_SCAN_LIMIT (1U << 28) @@ -74,10 +68,7 @@ static int xfs_icwalk_ag(struct xfs_perag *pag, #define XFS_ICWALK_FLAG_RECLAIM_SICK (1U << 27) #define XFS_ICWALK_FLAG_UNION (1U << 26) /* union filter algorithm */
-#define XFS_ICWALK_PRIVATE_FLAGS (XFS_ICWALK_FLAG_DROP_UDQUOT | \ - XFS_ICWALK_FLAG_DROP_GDQUOT | \ - XFS_ICWALK_FLAG_DROP_PDQUOT | \ - XFS_ICWALK_FLAG_SCAN_LIMIT | \ +#define XFS_ICWALK_PRIVATE_FLAGS (XFS_ICWALK_FLAG_SCAN_LIMIT | \ XFS_ICWALK_FLAG_RECLAIM_SICK | \ XFS_ICWALK_FLAG_UNION)
@@ -816,97 +807,6 @@ xfs_icache_inode_is_allocated( return 0; }
-#ifdef CONFIG_XFS_QUOTA -/* Decide if we want to grab this inode to drop its dquots. */ -static bool -xfs_dqrele_igrab( - struct xfs_inode *ip) -{ - bool ret = false; - - ASSERT(rcu_read_lock_held()); - - /* Check for stale RCU freed inode */ - spin_lock(&ip->i_flags_lock); - if (!ip->i_ino) - goto out_unlock; - - /* - * Skip inodes that are anywhere in the reclaim machinery because we - * drop dquots before tagging an inode for reclamation. - */ - if (ip->i_flags & (XFS_IRECLAIM | XFS_IRECLAIMABLE)) - goto out_unlock; - - /* - * The inode looks alive; try to grab a VFS reference so that it won't - * get destroyed. If we got the reference, return true to say that - * we grabbed the inode. - * - * If we can't get the reference, then we know the inode had its VFS - * state torn down and hasn't yet entered the reclaim machinery. Since - * we also know that dquots are detached from an inode before it enters - * reclaim, we can skip the inode. - */ - ret = igrab(VFS_I(ip)) != NULL; - -out_unlock: - spin_unlock(&ip->i_flags_lock); - return ret; -} - -/* Drop this inode's dquots. */ -static void -xfs_dqrele_inode( - struct xfs_inode *ip, - struct xfs_icwalk *icw) -{ - if (xfs_iflags_test(ip, XFS_INEW)) - xfs_inew_wait(ip); - - xfs_ilock(ip, XFS_ILOCK_EXCL); - if (icw->icw_flags & XFS_ICWALK_FLAG_DROP_UDQUOT) { - xfs_qm_dqrele(ip->i_udquot); - ip->i_udquot = NULL; - } - if (icw->icw_flags & XFS_ICWALK_FLAG_DROP_GDQUOT) { - xfs_qm_dqrele(ip->i_gdquot); - ip->i_gdquot = NULL; - } - if (icw->icw_flags & XFS_ICWALK_FLAG_DROP_PDQUOT) { - xfs_qm_dqrele(ip->i_pdquot); - ip->i_pdquot = NULL; - } - xfs_iunlock(ip, XFS_ILOCK_EXCL); - xfs_irele(ip); -} - -/* - * Detach all dquots from incore inodes if we can. The caller must already - * have dropped the relevant XFS_[UGP]QUOTA_ACTIVE flags so that dquots will - * not get reattached. - */ -int -xfs_dqrele_all_inodes( - struct xfs_mount *mp, - unsigned int qflags) -{ - struct xfs_icwalk icw = { .icw_flags = 0 }; - - if (qflags & XFS_UQUOTA_ACCT) - icw.icw_flags |= XFS_ICWALK_FLAG_DROP_UDQUOT; - if (qflags & XFS_GQUOTA_ACCT) - icw.icw_flags |= XFS_ICWALK_FLAG_DROP_GDQUOT; - if (qflags & XFS_PQUOTA_ACCT) - icw.icw_flags |= XFS_ICWALK_FLAG_DROP_PDQUOT; - - return xfs_icwalk(mp, XFS_ICWALK_DQRELE, &icw); -} -#else -# define xfs_dqrele_igrab(ip) (false) -# define xfs_dqrele_inode(ip, priv) ((void)0) -#endif /* CONFIG_XFS_QUOTA */ - /* * Grab the inode for reclaim exclusively. * @@ -1655,8 +1555,6 @@ xfs_icwalk_igrab( struct xfs_icwalk *icw) { switch (goal) { - case XFS_ICWALK_DQRELE: - return xfs_dqrele_igrab(ip); case XFS_ICWALK_BLOCKGC: return xfs_blockgc_igrab(ip); case XFS_ICWALK_RECLAIM: @@ -1680,9 +1578,6 @@ xfs_icwalk_process_inode( int error = 0;
switch (goal) { - case XFS_ICWALK_DQRELE: - xfs_dqrele_inode(ip, icw); - break; case XFS_ICWALK_BLOCKGC: error = xfs_blockgc_scan_inode(ip, icw); break; diff --git a/fs/xfs/xfs_icache.h b/fs/xfs/xfs_icache.h index 00dc98a92835..51d9beddbbc7 100644 --- a/fs/xfs/xfs_icache.h +++ b/fs/xfs/xfs_icache.h @@ -68,12 +68,6 @@ void xfs_inode_clear_cowblocks_tag(struct xfs_inode *ip);
void xfs_blockgc_worker(struct work_struct *work);
-#ifdef CONFIG_XFS_QUOTA -int xfs_dqrele_all_inodes(struct xfs_mount *mp, unsigned int qflags); -#else -# define xfs_dqrele_all_inodes(mp, qflags) (0) -#endif - int xfs_icache_inode_is_allocated(struct xfs_mount *mp, struct xfs_trans *tp, xfs_ino_t ino, bool *inuse);
From: Christoph Hellwig hch@lst.de
mainline-inclusion from mainline-v5.14-rc4 commit e497dfba6bd7b11856fef4e9e8050de895b7faea category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4KIAO CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
-------------------------------------------------
We always purge all dquots now, so drop the argument.
Signed-off-by: Christoph Hellwig hch@lst.de Reviewed-by: Carlos Maiolino cmaiolino@redhat.com Reviewed-by: Darrick J. Wong djwong@kernel.org Signed-off-by: Darrick J. Wong djwong@kernel.org Reviewed-by: Zhang Yi yi.zhang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- fs/xfs/xfs_qm.c | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-)
diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index dfee29e39d8e..a0f85d37eb85 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -185,15 +185,11 @@ xfs_qm_dqpurge( */ static void xfs_qm_dqpurge_all( - struct xfs_mount *mp, - uint flags) + struct xfs_mount *mp) { - if (flags & XFS_QMOPT_UQUOTA) - xfs_qm_dquot_walk(mp, XFS_DQTYPE_USER, xfs_qm_dqpurge, NULL); - if (flags & XFS_QMOPT_GQUOTA) - xfs_qm_dquot_walk(mp, XFS_DQTYPE_GROUP, xfs_qm_dqpurge, NULL); - if (flags & XFS_QMOPT_PQUOTA) - xfs_qm_dquot_walk(mp, XFS_DQTYPE_PROJ, xfs_qm_dqpurge, NULL); + xfs_qm_dquot_walk(mp, XFS_DQTYPE_USER, xfs_qm_dqpurge, NULL); + xfs_qm_dquot_walk(mp, XFS_DQTYPE_GROUP, xfs_qm_dqpurge, NULL); + xfs_qm_dquot_walk(mp, XFS_DQTYPE_PROJ, xfs_qm_dqpurge, NULL); }
/* @@ -204,7 +200,7 @@ xfs_qm_unmount( struct xfs_mount *mp) { if (mp->m_quotainfo) { - xfs_qm_dqpurge_all(mp, XFS_QMOPT_QUOTALL); + xfs_qm_dqpurge_all(mp); xfs_qm_destroy_quotainfo(mp); } } @@ -1354,7 +1350,7 @@ xfs_qm_quotacheck( * at this point (because we intentionally didn't in dqget_noattach). */ if (error) { - xfs_qm_dqpurge_all(mp, XFS_QMOPT_QUOTALL); + xfs_qm_dqpurge_all(mp); goto error_return; }
From: Christoph Hellwig hch@lst.de
mainline-inclusion from mainline-v5.14-rc4 commit 149e53afc851713a70b6a06ebfaf2ebf25975454 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4KIAO CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
-------------------------------------------------
These only made a difference when quotaoff supported disabling quota accounting on a mounted file system, so we can switch everyone to use a single set of flags and helpers now. Note that the *QUOTA_ON naming for the helpers is kept as it was the much more commonly used one.
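For reference, the consolidated helpers are just the old _RUNNING tests under the _ON names; condensed from the xfs_quota_defs.h hunk below (XFS_ALL_QUOTA_ACCT is the union of the user, group and project accounting bits):

    #define XFS_IS_QUOTA_ON(mp)     ((mp)->m_qflags & XFS_ALL_QUOTA_ACCT)
    #define XFS_IS_UQUOTA_ON(mp)    ((mp)->m_qflags & XFS_UQUOTA_ACCT)

    /* Callers that used to test both predicates now need only one: */
    if (!XFS_IS_QUOTA_ON(mp))
            return 0;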
Signed-off-by: Christoph Hellwig hch@lst.de Reviewed-by: Carlos Maiolino cmaiolino@redhat.com Reviewed-by: Darrick J. Wong djwong@kernel.org Signed-off-by: Darrick J. Wong djwong@kernel.org Reviewed-by: Zhang Yi yi.zhang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- fs/xfs/libxfs/xfs_quota_defs.h | 30 ++++-------------------- fs/xfs/scrub/quota.c | 2 +- fs/xfs/xfs_dquot.c | 3 --- fs/xfs/xfs_ioctl.c | 2 +- fs/xfs/xfs_iops.c | 4 ++-- fs/xfs/xfs_mount.c | 4 +--- fs/xfs/xfs_qm.c | 26 ++++++++++---------- fs/xfs/xfs_qm_syscalls.c | 2 +- fs/xfs/xfs_quotaops.c | 30 ++++++++---------------- fs/xfs/xfs_super.c | 43 ++++++++++++++-------------------- fs/xfs/xfs_trans_dquot.c | 11 ++++----- 11 files changed, 55 insertions(+), 102 deletions(-)
diff --git a/fs/xfs/libxfs/xfs_quota_defs.h b/fs/xfs/libxfs/xfs_quota_defs.h index 0f0af4e35032..a02c5062f9b2 100644 --- a/fs/xfs/libxfs/xfs_quota_defs.h +++ b/fs/xfs/libxfs/xfs_quota_defs.h @@ -60,36 +60,14 @@ typedef uint8_t xfs_dqtype_t; #define XFS_DQUOT_LOGRES(mp) \ ((sizeof(struct xfs_dq_logformat) + sizeof(struct xfs_disk_dquot)) * 6)
-#define XFS_IS_QUOTA_RUNNING(mp) ((mp)->m_qflags & XFS_ALL_QUOTA_ACCT) -#define XFS_IS_UQUOTA_RUNNING(mp) ((mp)->m_qflags & XFS_UQUOTA_ACCT) -#define XFS_IS_PQUOTA_RUNNING(mp) ((mp)->m_qflags & XFS_PQUOTA_ACCT) -#define XFS_IS_GQUOTA_RUNNING(mp) ((mp)->m_qflags & XFS_GQUOTA_ACCT) +#define XFS_IS_QUOTA_ON(mp) ((mp)->m_qflags & XFS_ALL_QUOTA_ACCT) +#define XFS_IS_UQUOTA_ON(mp) ((mp)->m_qflags & XFS_UQUOTA_ACCT) +#define XFS_IS_PQUOTA_ON(mp) ((mp)->m_qflags & XFS_PQUOTA_ACCT) +#define XFS_IS_GQUOTA_ON(mp) ((mp)->m_qflags & XFS_GQUOTA_ACCT) #define XFS_IS_UQUOTA_ENFORCED(mp) ((mp)->m_qflags & XFS_UQUOTA_ENFD) #define XFS_IS_GQUOTA_ENFORCED(mp) ((mp)->m_qflags & XFS_GQUOTA_ENFD) #define XFS_IS_PQUOTA_ENFORCED(mp) ((mp)->m_qflags & XFS_PQUOTA_ENFD)
-/* - * Incore only flags for quotaoff - these bits get cleared when quota(s) - * are in the process of getting turned off. These flags are in m_qflags but - * never in sb_qflags. - */ -#define XFS_UQUOTA_ACTIVE 0x1000 /* uquotas are being turned off */ -#define XFS_GQUOTA_ACTIVE 0x2000 /* gquotas are being turned off */ -#define XFS_PQUOTA_ACTIVE 0x4000 /* pquotas are being turned off */ -#define XFS_ALL_QUOTA_ACTIVE \ - (XFS_UQUOTA_ACTIVE | XFS_GQUOTA_ACTIVE | XFS_PQUOTA_ACTIVE) - -/* - * Checking XFS_IS_*QUOTA_ON() while holding any inode lock guarantees - * quota will be not be switched off as long as that inode lock is held. - */ -#define XFS_IS_QUOTA_ON(mp) ((mp)->m_qflags & (XFS_UQUOTA_ACTIVE | \ - XFS_GQUOTA_ACTIVE | \ - XFS_PQUOTA_ACTIVE)) -#define XFS_IS_UQUOTA_ON(mp) ((mp)->m_qflags & XFS_UQUOTA_ACTIVE) -#define XFS_IS_GQUOTA_ON(mp) ((mp)->m_qflags & XFS_GQUOTA_ACTIVE) -#define XFS_IS_PQUOTA_ON(mp) ((mp)->m_qflags & XFS_PQUOTA_ACTIVE) - /* * Flags to tell various functions what to do. Not all of these are meaningful * to a single function. None of these XFS_QMOPT_* flags are meant to have diff --git a/fs/xfs/scrub/quota.c b/fs/xfs/scrub/quota.c index e34ca20ae8e4..ef50fde9b622 100644 --- a/fs/xfs/scrub/quota.c +++ b/fs/xfs/scrub/quota.c @@ -43,7 +43,7 @@ xchk_setup_quota( xfs_dqtype_t dqtype; int error;
- if (!XFS_IS_QUOTA_RUNNING(sc->mp) || !XFS_IS_QUOTA_ON(sc->mp)) + if (!XFS_IS_QUOTA_ON(sc->mp)) return -ENOENT;
dqtype = xchk_quota_to_dqtype(sc); diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c index 1d95ed387d66..b8d0b305468b 100644 --- a/fs/xfs/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c @@ -808,9 +808,6 @@ xfs_qm_dqget_checks( struct xfs_mount *mp, xfs_dqtype_t type) { - if (WARN_ON_ONCE(!XFS_IS_QUOTA_RUNNING(mp))) - return -ESRCH; - switch (type) { case XFS_DQTYPE_USER: if (!XFS_IS_UQUOTA_ON(mp)) diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index 5eb78915412a..01d5a82edf6e 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -1498,7 +1498,7 @@ xfs_ioctl_setattr(
/* Change the ownerships and register project quota modifications */ if (ip->i_d.di_projid != fa->fsx_projid) { - if (XFS_IS_QUOTA_RUNNING(mp) && XFS_IS_PQUOTA_ON(mp)) { + if (XFS_IS_PQUOTA_ON(mp)) { olddquot = xfs_qm_vop_chown(tp, ip, &ip->i_pdquot, pdqp); } diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index afacde25c9ed..51f84c0e9417 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -738,7 +738,7 @@ xfs_setattr_nonsize( * in the transaction. */ if (!uid_eq(iuid, uid)) { - if (XFS_IS_QUOTA_RUNNING(mp) && XFS_IS_UQUOTA_ON(mp)) { + if (XFS_IS_UQUOTA_ON(mp)) { ASSERT(mask & ATTR_UID); ASSERT(udqp); olddquot1 = xfs_qm_vop_chown(tp, ip, @@ -747,7 +747,7 @@ xfs_setattr_nonsize( inode->i_uid = uid; } if (!gid_eq(igid, gid)) { - if (XFS_IS_QUOTA_RUNNING(mp) && XFS_IS_GQUOTA_ON(mp)) { + if (XFS_IS_GQUOTA_ON(mp)) { ASSERT(xfs_sb_version_has_pquotino(&mp->m_sb) || !XFS_IS_PQUOTA_ON(mp)); ASSERT(mask & ATTR_GID); diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index d7ce93e16cb8..18106b08b0be 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -948,13 +948,11 @@ xfs_mountfs( /* * Initialise the XFS quota management subsystem for this mount */ - if (XFS_IS_QUOTA_RUNNING(mp)) { + if (XFS_IS_QUOTA_ON(mp)) { error = xfs_qm_newmount(mp, "amount, "aflags); if (error) goto out_rtunmount; } else { - ASSERT(!XFS_IS_QUOTA_ON(mp)); - /* * If a file system had quotas running earlier, but decided to * mount without -o uquota/pquota/gquota options, revoke the diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index a0f85d37eb85..601eec773bb2 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -293,8 +293,6 @@ xfs_qm_need_dqattach( { struct xfs_mount *mp = ip->i_mount;
- if (!XFS_IS_QUOTA_RUNNING(mp)) - return false; if (!XFS_IS_QUOTA_ON(mp)) return false; if (!XFS_NOT_DQATTACHED(mp, ip)) @@ -629,7 +627,7 @@ xfs_qm_init_quotainfo( struct xfs_quotainfo *qinf; int error;
- ASSERT(XFS_IS_QUOTA_RUNNING(mp)); + ASSERT(XFS_IS_QUOTA_ON(mp));
qinf = mp->m_quotainfo = kmem_zalloc(sizeof(struct xfs_quotainfo), 0);
@@ -674,11 +672,11 @@ xfs_qm_init_quotainfo( xfs_qm_init_timelimits(mp, XFS_DQTYPE_GROUP); xfs_qm_init_timelimits(mp, XFS_DQTYPE_PROJ);
- if (XFS_IS_UQUOTA_RUNNING(mp)) + if (XFS_IS_UQUOTA_ON(mp)) xfs_qm_set_defquota(mp, XFS_DQTYPE_USER, qinf); - if (XFS_IS_GQUOTA_RUNNING(mp)) + if (XFS_IS_GQUOTA_ON(mp)) xfs_qm_set_defquota(mp, XFS_DQTYPE_GROUP, qinf); - if (XFS_IS_PQUOTA_RUNNING(mp)) + if (XFS_IS_PQUOTA_ON(mp)) xfs_qm_set_defquota(mp, XFS_DQTYPE_PROJ, qinf);
qinf->qi_shrinker.count_objects = xfs_qm_shrink_count; @@ -1136,7 +1134,7 @@ xfs_qm_dqusage_adjust( xfs_filblks_t rtblks = 0; /* total rt blks */ int error;
- ASSERT(XFS_IS_QUOTA_RUNNING(mp)); + ASSERT(XFS_IS_QUOTA_ON(mp));
/* * rootino must have its resources accounted for, not so with the quota @@ -1279,7 +1277,7 @@ xfs_qm_quotacheck( flags = 0;
ASSERT(uip || gip || pip); - ASSERT(XFS_IS_QUOTA_RUNNING(mp)); + ASSERT(XFS_IS_QUOTA_ON(mp));
xfs_notice(mp, "Quotacheck needed: Please wait.");
@@ -1409,7 +1407,7 @@ xfs_qm_mount_quotas( goto write_changes; }
- ASSERT(XFS_IS_QUOTA_RUNNING(mp)); + ASSERT(XFS_IS_QUOTA_ON(mp));
/* * Allocate the quotainfo structure inside the mount struct, and @@ -1464,7 +1462,7 @@ xfs_qm_mount_quotas( * the incore structures are convinced that quotas are * off, but the on disk superblock doesn't know that ! */ - ASSERT(!(XFS_IS_QUOTA_RUNNING(mp))); + ASSERT(!(XFS_IS_QUOTA_ON(mp))); xfs_alert(mp, "%s: Superblock update failed!", __func__); } @@ -1636,7 +1634,7 @@ xfs_qm_vop_dqalloc( int error; uint lockflags;
- if (!XFS_IS_QUOTA_RUNNING(mp) || !XFS_IS_QUOTA_ON(mp)) + if (!XFS_IS_QUOTA_ON(mp)) return 0;
lockflags = XFS_ILOCK_EXCL; @@ -1767,7 +1765,7 @@ xfs_qm_vop_chown(
ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); - ASSERT(XFS_IS_QUOTA_RUNNING(ip->i_mount)); + ASSERT(XFS_IS_QUOTA_ON(ip->i_mount));
/* old dquot */ prevdq = *IO_olddq; @@ -1820,7 +1818,7 @@ xfs_qm_vop_rename_dqattach( struct xfs_mount *mp = i_tab[0]->i_mount; int i;
- if (!XFS_IS_QUOTA_RUNNING(mp) || !XFS_IS_QUOTA_ON(mp)) + if (!XFS_IS_QUOTA_ON(mp)) return 0;
for (i = 0; (i < 4 && i_tab[i]); i++) { @@ -1851,7 +1849,7 @@ xfs_qm_vop_create_dqattach( { struct xfs_mount *mp = tp->t_mountp;
- if (!XFS_IS_QUOTA_RUNNING(mp) || !XFS_IS_QUOTA_ON(mp)) + if (!XFS_IS_QUOTA_ON(mp)) return;
ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c index 473b2458c6ca..7e7a7bead8c6 100644 --- a/fs/xfs/xfs_qm_syscalls.c +++ b/fs/xfs/xfs_qm_syscalls.c @@ -204,7 +204,7 @@ xfs_qm_scall_quotaon( (mp->m_qflags & XFS_GQUOTA_ACCT))) return 0;
- if (! XFS_IS_QUOTA_RUNNING(mp)) + if (!XFS_IS_QUOTA_ON(mp)) return -ESRCH;
/* diff --git a/fs/xfs/xfs_quotaops.c b/fs/xfs/xfs_quotaops.c index d27c0e852c0b..a7c0388c38e7 100644 --- a/fs/xfs/xfs_quotaops.c +++ b/fs/xfs/xfs_quotaops.c @@ -60,18 +60,18 @@ xfs_fs_get_quota_state( struct xfs_quotainfo *q = mp->m_quotainfo;
memset(state, 0, sizeof(*state)); - if (!XFS_IS_QUOTA_RUNNING(mp)) + if (!XFS_IS_QUOTA_ON(mp)) return 0; state->s_incoredqs = q->qi_dquots; - if (XFS_IS_UQUOTA_RUNNING(mp)) + if (XFS_IS_UQUOTA_ON(mp)) state->s_state[USRQUOTA].flags |= QCI_ACCT_ENABLED; if (XFS_IS_UQUOTA_ENFORCED(mp)) state->s_state[USRQUOTA].flags |= QCI_LIMITS_ENFORCED; - if (XFS_IS_GQUOTA_RUNNING(mp)) + if (XFS_IS_GQUOTA_ON(mp)) state->s_state[GRPQUOTA].flags |= QCI_ACCT_ENABLED; if (XFS_IS_GQUOTA_ENFORCED(mp)) state->s_state[GRPQUOTA].flags |= QCI_LIMITS_ENFORCED; - if (XFS_IS_PQUOTA_RUNNING(mp)) + if (XFS_IS_PQUOTA_ON(mp)) state->s_state[PRJQUOTA].flags |= QCI_ACCT_ENABLED; if (XFS_IS_PQUOTA_ENFORCED(mp)) state->s_state[PRJQUOTA].flags |= QCI_LIMITS_ENFORCED; @@ -114,10 +114,8 @@ xfs_fs_set_info(
if (sb_rdonly(sb)) return -EROFS; - if (!XFS_IS_QUOTA_RUNNING(mp)) - return -ENOSYS; if (!XFS_IS_QUOTA_ON(mp)) - return -ESRCH; + return -ENOSYS; if (info->i_fieldmask & ~XFS_QC_SETINFO_MASK) return -EINVAL; if ((info->i_fieldmask & XFS_QC_SETINFO_MASK) == 0) @@ -164,7 +162,7 @@ xfs_quota_enable(
if (sb_rdonly(sb)) return -EROFS; - if (!XFS_IS_QUOTA_RUNNING(mp)) + if (!XFS_IS_QUOTA_ON(mp)) return -ENOSYS;
return xfs_qm_scall_quotaon(mp, xfs_quota_flags(uflags)); @@ -179,10 +177,8 @@ xfs_quota_disable(
if (sb_rdonly(sb)) return -EROFS; - if (!XFS_IS_QUOTA_RUNNING(mp)) - return -ENOSYS; if (!XFS_IS_QUOTA_ON(mp)) - return -EINVAL; + return -ENOSYS;
return xfs_qm_scall_quotaoff(mp, xfs_quota_flags(uflags)); } @@ -223,10 +219,8 @@ xfs_fs_get_dqblk( struct xfs_mount *mp = XFS_M(sb); xfs_dqid_t id;
- if (!XFS_IS_QUOTA_RUNNING(mp)) - return -ENOSYS; if (!XFS_IS_QUOTA_ON(mp)) - return -ESRCH; + return -ENOSYS;
id = from_kqid(&init_user_ns, qid); return xfs_qm_scall_getquota(mp, id, xfs_quota_type(qid.type), qdq); @@ -243,10 +237,8 @@ xfs_fs_get_nextdqblk( struct xfs_mount *mp = XFS_M(sb); xfs_dqid_t id;
- if (!XFS_IS_QUOTA_RUNNING(mp)) - return -ENOSYS; if (!XFS_IS_QUOTA_ON(mp)) - return -ESRCH; + return -ENOSYS;
id = from_kqid(&init_user_ns, *qid); ret = xfs_qm_scall_getquota_next(mp, &id, xfs_quota_type(qid->type), @@ -269,10 +261,8 @@ xfs_fs_set_dqblk(
if (sb_rdonly(sb)) return -EROFS; - if (!XFS_IS_QUOTA_RUNNING(mp)) - return -ENOSYS; if (!XFS_IS_QUOTA_ON(mp)) - return -ESRCH; + return -ENOSYS;
return xfs_qm_scall_setqlim(mp, from_kqid(&init_user_ns, qid), xfs_quota_type(qid.type), qdq); diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index bc2c1926f67f..515990265019 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -222,23 +222,20 @@ xfs_fs_show_options( seq_printf(m, ",swidth=%d", (int)XFS_FSB_TO_BB(mp, mp->m_swidth));
- if (mp->m_qflags & (XFS_UQUOTA_ACCT|XFS_UQUOTA_ENFD)) + if (mp->m_qflags & XFS_UQUOTA_ENFD) seq_puts(m, ",usrquota"); else if (mp->m_qflags & XFS_UQUOTA_ACCT) seq_puts(m, ",uqnoenforce");
- if (mp->m_qflags & XFS_PQUOTA_ACCT) { - if (mp->m_qflags & XFS_PQUOTA_ENFD) - seq_puts(m, ",prjquota"); - else - seq_puts(m, ",pqnoenforce"); - } - if (mp->m_qflags & XFS_GQUOTA_ACCT) { - if (mp->m_qflags & XFS_GQUOTA_ENFD) - seq_puts(m, ",grpquota"); - else - seq_puts(m, ",gqnoenforce"); - } + if (mp->m_qflags & XFS_PQUOTA_ENFD) + seq_puts(m, ",prjquota"); + else if (mp->m_qflags & XFS_PQUOTA_ACCT) + seq_puts(m, ",pqnoenforce"); + + if (mp->m_qflags & XFS_GQUOTA_ENFD) + seq_puts(m, ",grpquota"); + else if (mp->m_qflags & XFS_GQUOTA_ACCT) + seq_puts(m, ",gqnoenforce");
if (!(mp->m_qflags & XFS_ALL_QUOTA_ACCT)) seq_puts(m, ",noquota"); @@ -1022,8 +1019,8 @@ xfs_finish_flags( return -EROFS; }
- if ((mp->m_qflags & (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE)) && - (mp->m_qflags & (XFS_PQUOTA_ACCT | XFS_PQUOTA_ACTIVE)) && + if ((mp->m_qflags & XFS_GQUOTA_ACCT) && + (mp->m_qflags & XFS_PQUOTA_ACCT) && !xfs_sb_version_has_pquotino(&mp->m_sb)) { xfs_warn(mp, "Super block does not support project and group quota together"); @@ -1273,35 +1270,31 @@ xfs_fc_parse_param( case Opt_noquota: mp->m_qflags &= ~XFS_ALL_QUOTA_ACCT; mp->m_qflags &= ~XFS_ALL_QUOTA_ENFD; - mp->m_qflags &= ~XFS_ALL_QUOTA_ACTIVE; return 0; case Opt_quota: case Opt_uquota: case Opt_usrquota: - mp->m_qflags |= (XFS_UQUOTA_ACCT | XFS_UQUOTA_ACTIVE | - XFS_UQUOTA_ENFD); + mp->m_qflags |= (XFS_UQUOTA_ACCT | XFS_UQUOTA_ENFD); return 0; case Opt_qnoenforce: case Opt_uqnoenforce: - mp->m_qflags |= (XFS_UQUOTA_ACCT | XFS_UQUOTA_ACTIVE); + mp->m_qflags |= XFS_UQUOTA_ACCT; mp->m_qflags &= ~XFS_UQUOTA_ENFD; return 0; case Opt_pquota: case Opt_prjquota: - mp->m_qflags |= (XFS_PQUOTA_ACCT | XFS_PQUOTA_ACTIVE | - XFS_PQUOTA_ENFD); + mp->m_qflags |= (XFS_PQUOTA_ACCT | XFS_PQUOTA_ENFD); return 0; case Opt_pqnoenforce: - mp->m_qflags |= (XFS_PQUOTA_ACCT | XFS_PQUOTA_ACTIVE); + mp->m_qflags |= XFS_PQUOTA_ACCT; mp->m_qflags &= ~XFS_PQUOTA_ENFD; return 0; case Opt_gquota: case Opt_grpquota: - mp->m_qflags |= (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE | - XFS_GQUOTA_ENFD); + mp->m_qflags |= (XFS_GQUOTA_ACCT | XFS_GQUOTA_ENFD); return 0; case Opt_gqnoenforce: - mp->m_qflags |= (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE); + mp->m_qflags |= XFS_GQUOTA_ACCT; mp->m_qflags &= ~XFS_GQUOTA_ENFD; return 0; case Opt_discard: diff --git a/fs/xfs/xfs_trans_dquot.c b/fs/xfs/xfs_trans_dquot.c index 3a7b4ad862a8..3d7386650cfe 100644 --- a/fs/xfs/xfs_trans_dquot.c +++ b/fs/xfs/xfs_trans_dquot.c @@ -139,8 +139,7 @@ xfs_trans_mod_dquot_byino( { xfs_mount_t *mp = tp->t_mountp;
- if (!XFS_IS_QUOTA_RUNNING(mp) || - !XFS_IS_QUOTA_ON(mp) || + if (!XFS_IS_QUOTA_ON(mp) || xfs_is_quota_inode(&mp->m_sb, ip->i_ino)) return;
@@ -202,7 +201,7 @@ xfs_trans_mod_dquot( struct xfs_dqtrx *qtrx;
ASSERT(tp); - ASSERT(XFS_IS_QUOTA_RUNNING(tp->t_mountp)); + ASSERT(XFS_IS_QUOTA_ON(tp->t_mountp)); qtrx = NULL;
if (tp->t_dqinfo == NULL) @@ -756,7 +755,7 @@ xfs_trans_reserve_quota_bydquots( { int error;
- if (!XFS_IS_QUOTA_RUNNING(mp) || !XFS_IS_QUOTA_ON(mp)) + if (!XFS_IS_QUOTA_ON(mp)) return 0;
if (tp && tp->t_dqinfo == NULL) @@ -816,7 +815,7 @@ xfs_trans_reserve_quota_nblks( unsigned int qflags = 0; int error;
- if (!XFS_IS_QUOTA_RUNNING(mp) || !XFS_IS_QUOTA_ON(mp)) + if (!XFS_IS_QUOTA_ON(mp)) return 0;
ASSERT(!xfs_is_quota_inode(&mp->m_sb, ip->i_ino)); @@ -857,7 +856,7 @@ xfs_trans_reserve_quota_icreate( { struct xfs_mount *mp = tp->t_mountp;
- if (!XFS_IS_QUOTA_RUNNING(mp) || !XFS_IS_QUOTA_ON(mp)) + if (!XFS_IS_QUOTA_ON(mp)) return 0;
return xfs_trans_reserve_quota_bydquots(tp, mp, udqp, gdqp, pdqp,
From: "Darrick J. Wong" djwong@kernel.org
mainline-inclusion from mainline-v5.14-rc4 commit c6c2066db3963e519a7ff8f432fcec956f4d23b4 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4KIAO CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
-------------------------------------------------
Move the xfs_inactive call and all the other debugging checks and stats updates into xfs_inode_mark_reclaimable, because most of that is implementation detail about the inode cache. This is preparation for the deferred inactivation that is coming up; we also move the code around within xfs_icache.c for the same reason.
Signed-off-by: Darrick J. Wong djwong@kernel.org Reviewed-by: Dave Chinner dchinner@redhat.com Signed-off-by: Lihong Kou koulihong@huawei.com Reviewed-by: Zhang Yi yi.zhang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- fs/xfs/xfs_icache.c | 99 +++++++++++++++++++++++++++++++++------------ fs/xfs/xfs_super.c | 50 ----------------------- 2 files changed, 74 insertions(+), 75 deletions(-)
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index e3c89c511ee5..0029e815bac3 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -291,31 +291,6 @@ xfs_perag_clear_inode_tag( trace_xfs_perag_clear_inode_tag(mp, pag->pag_agno, tag, _RET_IP_); }
-/* - * We set the inode flag atomically with the radix tree tag. - * Once we get tag lookups on the radix tree, this inode flag - * can go away. - */ -void -xfs_inode_mark_reclaimable( - struct xfs_inode *ip) -{ - struct xfs_mount *mp = ip->i_mount; - struct xfs_perag *pag; - - pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino)); - spin_lock(&pag->pag_ici_lock); - spin_lock(&ip->i_flags_lock); - - xfs_perag_set_inode_tag(pag, XFS_INO_TO_AGINO(mp, ip->i_ino), - XFS_ICI_RECLAIM_TAG); - __xfs_iflags_set(ip, XFS_IRECLAIMABLE); - - spin_unlock(&ip->i_flags_lock); - spin_unlock(&pag->pag_ici_lock); - xfs_perag_put(pag); -} - static inline void xfs_inew_wait( struct xfs_inode *ip) @@ -1747,3 +1722,77 @@ xfs_icwalk( return last_error; BUILD_BUG_ON(XFS_ICWALK_PRIVATE_FLAGS & XFS_ICWALK_FLAGS_VALID); } + +#ifdef DEBUG +static void +xfs_check_delalloc( + struct xfs_inode *ip, + int whichfork) +{ + struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); + struct xfs_bmbt_irec got; + struct xfs_iext_cursor icur; + + if (!ifp || !xfs_iext_lookup_extent(ip, ifp, 0, &icur, &got)) + return; + do { + if (isnullstartblock(got.br_startblock)) { + xfs_warn(ip->i_mount, + "ino %llx %s fork has delalloc extent at [0x%llx:0x%llx]", + ip->i_ino, + whichfork == XFS_DATA_FORK ? "data" : "cow", + got.br_startoff, got.br_blockcount); + } + } while (xfs_iext_next_extent(ifp, &icur, &got)); +} +#else +#define xfs_check_delalloc(ip, whichfork) do { } while (0) +#endif + +/* + * We set the inode flag atomically with the radix tree tag. + * Once we get tag lookups on the radix tree, this inode flag + * can go away. + */ +void +xfs_inode_mark_reclaimable( + struct xfs_inode *ip) +{ + struct xfs_mount *mp = ip->i_mount; + struct xfs_perag *pag; + + xfs_inactive(ip); + + if (!XFS_FORCED_SHUTDOWN(mp) && ip->i_delayed_blks) { + xfs_check_delalloc(ip, XFS_DATA_FORK); + xfs_check_delalloc(ip, XFS_COW_FORK); + ASSERT(0); + } + + XFS_STATS_INC(mp, vn_reclaim); + + /* + * We should never get here with one of the reclaim flags already set. + */ + ASSERT_ALWAYS(!xfs_iflags_test(ip, XFS_IRECLAIMABLE)); + ASSERT_ALWAYS(!xfs_iflags_test(ip, XFS_IRECLAIM)); + + /* + * We always use background reclaim here because even if the inode is + * clean, it still may be under IO and hence we have wait for IO + * completion to occur before we can reclaim the inode. The background + * reclaim path handles this more efficiently than we can here, so + * simply let background reclaim tear down all inodes. + */ + pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino)); + spin_lock(&pag->pag_ici_lock); + spin_lock(&ip->i_flags_lock); + + xfs_perag_set_inode_tag(pag, XFS_INO_TO_AGINO(mp, ip->i_ino), + XFS_ICI_RECLAIM_TAG); + __xfs_iflags_set(ip, XFS_IRECLAIMABLE); + + spin_unlock(&ip->i_flags_lock); + spin_unlock(&pag->pag_ici_lock); + xfs_perag_put(pag); +} diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 515990265019..c669e95aaaee 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -619,32 +619,6 @@ xfs_fs_alloc_inode( return NULL; }
-#ifdef DEBUG -static void -xfs_check_delalloc( - struct xfs_inode *ip, - int whichfork) -{ - struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); - struct xfs_bmbt_irec got; - struct xfs_iext_cursor icur; - - if (!ifp || !xfs_iext_lookup_extent(ip, ifp, 0, &icur, &got)) - return; - do { - if (isnullstartblock(got.br_startblock)) { - xfs_warn(ip->i_mount, - "ino %llx %s fork has delalloc extent at [0x%llx:0x%llx]", - ip->i_ino, - whichfork == XFS_DATA_FORK ? "data" : "cow", - got.br_startoff, got.br_blockcount); - } - } while (xfs_iext_next_extent(ifp, &icur, &got)); -} -#else -#define xfs_check_delalloc(ip, whichfork) do { } while (0) -#endif - /* * Now that the generic code is guaranteed not to be accessing * the linux inode, we can inactivate and reclaim the inode. @@ -660,30 +634,6 @@ xfs_fs_destroy_inode( ASSERT(!rwsem_is_locked(&inode->i_rwsem)); XFS_STATS_INC(ip->i_mount, vn_rele); XFS_STATS_INC(ip->i_mount, vn_remove); - - xfs_inactive(ip); - - if (!XFS_FORCED_SHUTDOWN(ip->i_mount) && ip->i_delayed_blks) { - xfs_check_delalloc(ip, XFS_DATA_FORK); - xfs_check_delalloc(ip, XFS_COW_FORK); - ASSERT(0); - } - - XFS_STATS_INC(ip->i_mount, vn_reclaim); - - /* - * We should never get here with one of the reclaim flags already set. - */ - ASSERT_ALWAYS(!xfs_iflags_test(ip, XFS_IRECLAIMABLE)); - ASSERT_ALWAYS(!xfs_iflags_test(ip, XFS_IRECLAIM)); - - /* - * We always use background reclaim here because even if the inode is - * clean, it still may be under IO and hence we have wait for IO - * completion to occur before we can reclaim the inode. The background - * reclaim path handles this more efficiently than we can here, so - * simply let background reclaim tear down all inodes. - */ xfs_inode_mark_reclaimable(ip); }
From: "Darrick J. Wong" djwong@kernel.org
mainline-inclusion from mainline-v5.14-rc4 commit 62af7d54a0ec0b6f99d7d55ebeb9ecbb3371bc67 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4KIAO CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
-------------------------------------------------
If we don't need to inactivate an inode, we can detach the dquots and move on to reclamation. This isn't strictly required here; it's a preparation patch for deferred inactivation, per reviewer request[1], to move the creation of xfs_inode_needs_inactive into a separate change. Eventually this !need_inactive chunk will turn into the code path for inodes that skip xfs_inactive and go straight to memory reclaim.
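A minimal sketch of the resulting eviction-time decision (this mirrors the xfs_icache.c hunk below, with the new predicate inlined; xfs_qm_dqdetach() just drops the inode's dquot references):

    if (!xfs_inode_needs_inactive(ip)) {
            /* No on-disk metadata updates needed; go straight to reclaim. */
            xfs_qm_dqdetach(ip);
    } else {
            xfs_inactive(ip);
    }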
[1] https://lore.kernel.org/linux-xfs/20210609012838.GW2945738@locust/T/#mca6d95... Signed-off-by: Darrick J. Wong djwong@kernel.org Reviewed-by: Dave Chinner dchinner@redhat.com Signed-off-by: Lihong Kou koulihong@huawei.com Reviewed-by: Zhang Yi yi.zhang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- fs/xfs/xfs_icache.c | 8 ++++++- fs/xfs/xfs_inode.c | 53 +++++++++++++++++++++++++++++++++++++++++++++ fs/xfs/xfs_inode.h | 2 ++ 3 files changed, 62 insertions(+), 1 deletion(-)
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index 0029e815bac3..80cab7adad6c 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -1760,8 +1760,14 @@ xfs_inode_mark_reclaimable( { struct xfs_mount *mp = ip->i_mount; struct xfs_perag *pag; + bool need_inactive = xfs_inode_needs_inactive(ip);
- xfs_inactive(ip); + if (!need_inactive) { + /* Going straight to reclaim, so drop the dquots. */ + xfs_qm_dqdetach(ip); + } else { + xfs_inactive(ip); + }
if (!XFS_FORCED_SHUTDOWN(mp) && ip->i_delayed_blks) { xfs_check_delalloc(ip, XFS_DATA_FORK); diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 4b7228e2e39e..64b8408af9f1 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -1781,6 +1781,59 @@ xfs_inactive_ifree( return 0; }
+/* + * Returns true if we need to update the on-disk metadata before we can free + * the memory used by this inode. Updates include freeing post-eof + * preallocations; freeing COW staging extents; and marking the inode free in + * the inobt if it is on the unlinked list. + */ +bool +xfs_inode_needs_inactive( + struct xfs_inode *ip) +{ + struct xfs_mount *mp = ip->i_mount; + struct xfs_ifork *cow_ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK); + + /* + * If the inode is already free, then there can be nothing + * to clean up here. + */ + if (VFS_I(ip)->i_mode == 0) + return false; + + /* If this is a read-only mount, don't do this (would generate I/O) */ + if (mp->m_flags & XFS_MOUNT_RDONLY) + return false; + + /* If the log isn't running, push inodes straight to reclaim. */ + if (XFS_FORCED_SHUTDOWN(mp) || (mp->m_flags & XFS_MOUNT_NORECOVERY)) + return false; + + /* Metadata inodes require explicit resource cleanup. */ + if (xfs_is_metadata_inode(ip)) + return false; + + /* Want to clean out the cow blocks if there are any. */ + if (cow_ifp && cow_ifp->if_bytes > 0) + return true; + + /* Unlinked files must be freed. */ + if (VFS_I(ip)->i_nlink == 0) + return true; + + /* + * This file isn't being freed, so check if there are post-eof blocks + * to free. @force is true because we are evicting an inode from the + * cache. Post-eof blocks must be freed, lest we end up with broken + * free space accounting. + * + * Note: don't bother with iolock here since lockdep complains about + * acquiring it in reclaim context. We have the only reference to the + * inode at this point anyways. + */ + return xfs_can_free_eofblocks(ip, true); +} + /* * xfs_inactive * diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index 51cb2ba5462f..20ab29f06d09 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -476,6 +476,8 @@ extern struct kmem_zone *xfs_inode_zone; /* The default CoW extent size hint. */ #define XFS_DEFAULT_COWEXTSZ_HINT 32
+bool xfs_inode_needs_inactive(struct xfs_inode *ip); + int xfs_iunlink_init(struct xfs_perag *pag); void xfs_iunlink_destroy(struct xfs_perag *pag);
From: Dave Chinner dchinner@redhat.com
mainline-inclusion from mainline-v5.14-rc4 commit ab23a7768739a23d21d8a16ca37dff96b1ca957a category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4KIAO CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
-------------------------------------------------
Move inode inactivation to background work contexts so that it no longer runs in the context that releases the final reference to an inode. This will allow process work that ends up blocking on inactivation to continue doing work while the filesystem processes the inactivation in the background.
A typical demonstration of this is unlinking an inode with lots of extents. The extents are removed during inactivation, so this blocks the process that unlinked the inode from the directory structure. By moving the inactivation to the background process, the userspace application can keep working (e.g. unlinking the next inode in the directory) while the inactivation work on the previous inode is done by a different CPU.
The implementation of the queue is relatively simple. We use a per-cpu lockless linked list (llist) to queue inodes for inactivation without requiring serialisation mechanisms, and a work item to allow the queue to be processed by a CPU bound worker thread. We also keep a count of the queue depth so that we can trigger work after a number of deferred inactivations have been queued.
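The producer side of that queue reduces to a handful of lines; a condensed view of xfs_inodegc_queue() from the diff below, where gc->items is the per-cpu depth counter, updated with READ_ONCE/WRITE_ONCE because it is only ever an approximation:

    gc = get_cpu_ptr(mp->m_inodegc);        /* pin this CPU's queue */
    llist_add(&ip->i_gclist, &gc->list);    /* lockless push */
    items = READ_ONCE(gc->items);
    WRITE_ONCE(gc->items, items + 1);       /* approximate depth */
    put_cpu_ptr(gc);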
The use of a bound workqueue with a single work depth allows the workqueue to run one work item per CPU. We queue the work item on the CPU we are currently running on, and so this essentially gives us affine per-cpu worker threads for the per-cpu queues. This maintains the effective CPU affinity that occurs within XFS at the AG level due to all objects in a directory being local to an AG. Hence inactivation work tends to run on the same CPU that last accessed all the objects that inactivation accesses and this maintains hot CPU caches for unlink workloads.
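Flushing or re-enabling the mechanism pushes every CPU's pending work to that CPU's worker; a condensed view of xfs_inodegc_queue_all() from the diff below:

    for_each_online_cpu(cpu) {
            gc = per_cpu_ptr(mp->m_inodegc, cpu);
            if (!llist_empty(&gc->list))
                    queue_work_on(cpu, mp->m_inodegc_wq, &gc->work);
    }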
A depth of 32 inodes was chosen to match the number of inodes in an inode cluster buffer. This hopefully allows sequential allocation/unlink behaviours to defer inactivation of all the inodes in a single cluster buffer at a time, further helping maintain hot CPU and buffer cache accesses while running inactivations.
A hard per-cpu queue throttle of 256 inodes has been set to avoid runaway queueing when inodes that take a long time to inactivate are being processed, for example when unlinking inodes with large numbers of extents that take a lot of processing to free.
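Both tunables end up as plain comparisons against the per-cpu depth counter. Condensed from the two helper predicates and their call sites in the diff below (XFS_INODEGC_MAX_BACKLOG is 4 * XFS_INODES_PER_CHUNK, i.e. 256):

    /* Kick the worker once a cluster buffer's worth of inodes is queued. */
    if (items > mp->m_ino_geo.inodes_per_cluster)
            queue_work(mp->m_inodegc_wq, &gc->work);

    /* Throttle the frontend at 256, but never while holding a transaction. */
    if (!current->journal_info && items > XFS_INODEGC_MAX_BACKLOG)
            flush_work(&gc->work);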
Signed-off-by: Dave Chinner dchinner@redhat.com [djwong: tweak comments and tracepoints, convert opflags to state bits] Reviewed-by: Darrick J. Wong djwong@kernel.org Signed-off-by: Darrick J. Wong djwong@kernel.org Signed-off-by: Lihong Kou koulihong@huawei.com Reviewed-by: Zhang Yi yi.zhang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- fs/xfs/scrub/common.c | 7 + fs/xfs/xfs_icache.c | 346 +++++++++++++++++++++++++++++++++++---- fs/xfs/xfs_icache.h | 6 + fs/xfs/xfs_inode.h | 20 ++- fs/xfs/xfs_log_recover.c | 7 + fs/xfs/xfs_mount.c | 26 ++- fs/xfs/xfs_mount.h | 43 ++++- fs/xfs/xfs_super.c | 114 +++++++++++-- fs/xfs/xfs_trace.h | 50 +++++- 9 files changed, 571 insertions(+), 48 deletions(-)
diff --git a/fs/xfs/scrub/common.c b/fs/xfs/scrub/common.c index 922c1d84006b..5743cf6f8b18 100644 --- a/fs/xfs/scrub/common.c +++ b/fs/xfs/scrub/common.c @@ -905,6 +905,7 @@ xchk_stop_reaping( { sc->flags |= XCHK_REAPING_DISABLED; xfs_blockgc_stop(sc->mp); + xfs_inodegc_stop(sc->mp); }
/* Restart background reaping of resources. */ @@ -912,6 +913,12 @@ void xchk_start_reaping( struct xfs_scrub *sc) { + /* + * Readonly filesystems do not perform inactivation, so there's no + * need to restart the worker. + */ + if (!(sc->mp->m_flags & XFS_MOUNT_RDONLY)) + xfs_inodegc_start(sc->mp); xfs_blockgc_start(sc->mp); sc->flags &= ~XCHK_REAPING_DISABLED; } diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index 80cab7adad6c..eb27e11d67f1 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -212,7 +212,7 @@ xfs_blockgc_queue( { rcu_read_lock(); if (radix_tree_tagged(&pag->pag_ici_root, XFS_ICI_BLOCKGC_TAG)) - queue_delayed_work(pag->pag_mount->m_gc_workqueue, + queue_delayed_work(pag->pag_mount->m_blockgc_wq, &pag->pag_blockgc_work, msecs_to_jiffies(xfs_blockgc_secs * 1000)); rcu_read_unlock(); @@ -449,6 +449,21 @@ xfs_iget_check_free_state( return 0; }
+/* Make all pending inactivation work start immediately. */ +static void +xfs_inodegc_queue_all( + struct xfs_mount *mp) +{ + struct xfs_inodegc *gc; + int cpu; + + for_each_online_cpu(cpu) { + gc = per_cpu_ptr(mp->m_inodegc, cpu); + if (!llist_empty(&gc->list)) + queue_work_on(cpu, mp->m_inodegc_wq, &gc->work); + } +} + /* * Check the validity of the inode we just found it the cache */ @@ -481,13 +496,30 @@ xfs_iget_cache_hit( * reclaimable state, wait for the initialisation to complete * before continuing. * + * If we're racing with the inactivation worker we also want to wait. + * If we're creating a new file, it's possible that the worker + * previously marked the inode as free on disk but hasn't finished + * updating the incore state yet. The AGI buffer will be dirty and + * locked to the icreate transaction, so a synchronous push of the + * inodegc workers would result in deadlock. For a regular iget, the + * worker is running already, so we might as well wait. + * * XXX(hch): eventually we should do something equivalent to * wait_on_inode to wait for these flags to be cleared * instead of polling for it. */ - if (ip->i_flags & (XFS_INEW | XFS_IRECLAIM)) + if (ip->i_flags & (XFS_INEW | XFS_IRECLAIM | XFS_INACTIVATING)) goto out_skip;
+ if (ip->i_flags & XFS_NEED_INACTIVE) { + /* Unlinked inodes cannot be re-grabbed. */ + if (VFS_I(ip)->i_nlink == 0) { + error = -ENOENT; + goto out_error; + } + goto out_inodegc_flush; + } + /* * Check the inode free state is valid. This also detects lookup * racing with unlinks. @@ -535,6 +567,17 @@ xfs_iget_cache_hit( spin_unlock(&ip->i_flags_lock); rcu_read_unlock(); return error; + +out_inodegc_flush: + spin_unlock(&ip->i_flags_lock); + rcu_read_unlock(); + /* + * Do not wait for the workers, because the caller could hold an AGI + * buffer lock. We're just going to sleep in a loop anyway. + */ + if (xfs_is_inodegc_enabled(mp)) + xfs_inodegc_queue_all(mp); + return -EAGAIN; }
static int @@ -862,6 +905,7 @@ xfs_reclaim_inode(
xfs_iflags_clear(ip, XFS_IFLUSHING); reclaim: + trace_xfs_inode_reclaiming(ip);
/* * Because we use RCU freeing we need to ensure the inode always appears @@ -1348,6 +1392,8 @@ xfs_blockgc_start(
/* Don't try to run block gc on an inode that's in any of these states. */ #define XFS_BLOCKGC_NOGRAB_IFLAGS (XFS_INEW | \ + XFS_NEED_INACTIVE | \ + XFS_INACTIVATING | \ XFS_IRECLAIMABLE | \ XFS_IRECLAIM) /* @@ -1749,25 +1795,13 @@ xfs_check_delalloc( #define xfs_check_delalloc(ip, whichfork) do { } while (0) #endif
-/* - * We set the inode flag atomically with the radix tree tag. - * Once we get tag lookups on the radix tree, this inode flag - * can go away. - */ -void -xfs_inode_mark_reclaimable( +/* Schedule the inode for reclaim. */ +static void +xfs_inodegc_set_reclaimable( struct xfs_inode *ip) { struct xfs_mount *mp = ip->i_mount; struct xfs_perag *pag; - bool need_inactive = xfs_inode_needs_inactive(ip); - - if (!need_inactive) { - /* Going straight to reclaim, so drop the dquots. */ - xfs_qm_dqdetach(ip); - } else { - xfs_inactive(ip); - }
if (!XFS_FORCED_SHUTDOWN(mp) && ip->i_delayed_blks) { xfs_check_delalloc(ip, XFS_DATA_FORK); @@ -1775,30 +1809,276 @@ xfs_inode_mark_reclaimable( ASSERT(0); }
- XFS_STATS_INC(mp, vn_reclaim); - - /* - * We should never get here with one of the reclaim flags already set. - */ - ASSERT_ALWAYS(!xfs_iflags_test(ip, XFS_IRECLAIMABLE)); - ASSERT_ALWAYS(!xfs_iflags_test(ip, XFS_IRECLAIM)); - - /* - * We always use background reclaim here because even if the inode is - * clean, it still may be under IO and hence we have wait for IO - * completion to occur before we can reclaim the inode. The background - * reclaim path handles this more efficiently than we can here, so - * simply let background reclaim tear down all inodes. - */ pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino)); spin_lock(&pag->pag_ici_lock); spin_lock(&ip->i_flags_lock);
+ trace_xfs_inode_set_reclaimable(ip); + ip->i_flags &= ~(XFS_NEED_INACTIVE | XFS_INACTIVATING); + ip->i_flags |= XFS_IRECLAIMABLE; xfs_perag_set_inode_tag(pag, XFS_INO_TO_AGINO(mp, ip->i_ino), XFS_ICI_RECLAIM_TAG); - __xfs_iflags_set(ip, XFS_IRECLAIMABLE);
spin_unlock(&ip->i_flags_lock); spin_unlock(&pag->pag_ici_lock); xfs_perag_put(pag); } + +/* + * Free all speculative preallocations and possibly even the inode itself. + * This is the last chance to make changes to an otherwise unreferenced file + * before incore reclamation happens. + */ +static void +xfs_inodegc_inactivate( + struct xfs_inode *ip) +{ + trace_xfs_inode_inactivating(ip); + xfs_inactive(ip); + xfs_inodegc_set_reclaimable(ip); +} + +void +xfs_inodegc_worker( + struct work_struct *work) +{ + struct xfs_inodegc *gc = container_of(work, struct xfs_inodegc, + work); + struct llist_node *node = llist_del_all(&gc->list); + struct xfs_inode *ip, *n; + + WRITE_ONCE(gc->items, 0); + + if (!node) + return; + + ip = llist_entry(node, struct xfs_inode, i_gclist); + trace_xfs_inodegc_worker(ip->i_mount, __return_address); + + llist_for_each_entry_safe(ip, n, node, i_gclist) { + xfs_iflags_set(ip, XFS_INACTIVATING); + xfs_inodegc_inactivate(ip); + } +} + +/* + * Force all currently queued inode inactivation work to run immediately, and + * wait for the work to finish. Two pass - queue all the work first pass, wait + * for it in a second pass. + */ +void +xfs_inodegc_flush( + struct xfs_mount *mp) +{ + struct xfs_inodegc *gc; + int cpu; + + if (!xfs_is_inodegc_enabled(mp)) + return; + + trace_xfs_inodegc_flush(mp, __return_address); + + xfs_inodegc_queue_all(mp); + + for_each_online_cpu(cpu) { + gc = per_cpu_ptr(mp->m_inodegc, cpu); + flush_work(&gc->work); + } +} + +/* + * Flush all the pending work and then disable the inode inactivation background + * workers and wait for them to stop. + */ +void +xfs_inodegc_stop( + struct xfs_mount *mp) +{ + struct xfs_inodegc *gc; + int cpu; + + if (!xfs_clear_inodegc_enabled(mp)) + return; + + xfs_inodegc_queue_all(mp); + + for_each_online_cpu(cpu) { + gc = per_cpu_ptr(mp->m_inodegc, cpu); + cancel_work_sync(&gc->work); + } + trace_xfs_inodegc_stop(mp, __return_address); +} + +/* + * Enable the inode inactivation background workers and schedule deferred inode + * inactivation work if there is any. + */ +void +xfs_inodegc_start( + struct xfs_mount *mp) +{ + if (xfs_set_inodegc_enabled(mp)) + return; + + trace_xfs_inodegc_start(mp, __return_address); + xfs_inodegc_queue_all(mp); +} + +/* + * Schedule the inactivation worker when: + * + * - We've accumulated more than one inode cluster buffer's worth of inodes. + */ +static inline bool +xfs_inodegc_want_queue_work( + struct xfs_inode *ip, + unsigned int items) +{ + struct xfs_mount *mp = ip->i_mount; + + if (items > mp->m_ino_geo.inodes_per_cluster) + return true; + + return false; +} + +/* + * Upper bound on the number of inodes in each AG that can be queued for + * inactivation at any given time, to avoid monopolizing the workqueue. + */ +#define XFS_INODEGC_MAX_BACKLOG (4 * XFS_INODES_PER_CHUNK) + +/* + * Make the frontend wait for inactivations when: + * + * - The queue depth exceeds the maximum allowable percpu backlog. + * + * Note: If the current thread is running a transaction, we don't ever want to + * wait for other transactions because that could introduce a deadlock. + */ +static inline bool +xfs_inodegc_want_flush_work( + struct xfs_inode *ip, + unsigned int items) +{ + if (current->journal_info) + return false; + + if (items > XFS_INODEGC_MAX_BACKLOG) + return true; + + return false; +} + +/* + * Queue a background inactivation worker if there are inodes that need to be + * inactivated and higher level xfs code hasn't disabled the background + * workers. 
+ */ +static void +xfs_inodegc_queue( + struct xfs_inode *ip) +{ + struct xfs_mount *mp = ip->i_mount; + struct xfs_inodegc *gc; + int items; + + trace_xfs_inode_set_need_inactive(ip); + spin_lock(&ip->i_flags_lock); + ip->i_flags |= XFS_NEED_INACTIVE; + spin_unlock(&ip->i_flags_lock); + + gc = get_cpu_ptr(mp->m_inodegc); + llist_add(&ip->i_gclist, &gc->list); + items = READ_ONCE(gc->items); + WRITE_ONCE(gc->items, items + 1); + put_cpu_ptr(gc); + + if (!xfs_is_inodegc_enabled(mp)) + return; + + if (xfs_inodegc_want_queue_work(ip, items)) { + trace_xfs_inodegc_queue(mp, __return_address); + queue_work(mp->m_inodegc_wq, &gc->work); + } + + if (xfs_inodegc_want_flush_work(ip, items)) { + trace_xfs_inodegc_throttle(mp, __return_address); + flush_work(&gc->work); + } +} + +/* + * Fold the dead CPU inodegc queue into the current CPUs queue. + */ +void +xfs_inodegc_cpu_dead( + struct xfs_mount *mp, + unsigned int dead_cpu) +{ + struct xfs_inodegc *dead_gc, *gc; + struct llist_node *first, *last; + unsigned int count = 0; + + dead_gc = per_cpu_ptr(mp->m_inodegc, dead_cpu); + cancel_work_sync(&dead_gc->work); + + if (llist_empty(&dead_gc->list)) + return; + + first = dead_gc->list.first; + last = first; + while (last->next) { + last = last->next; + count++; + } + dead_gc->list.first = NULL; + dead_gc->items = 0; + + /* Add pending work to current CPU */ + gc = get_cpu_ptr(mp->m_inodegc); + llist_add_batch(first, last, &gc->list); + count += READ_ONCE(gc->items); + WRITE_ONCE(gc->items, count); + put_cpu_ptr(gc); + + if (xfs_is_inodegc_enabled(mp)) { + trace_xfs_inodegc_queue(mp, __return_address); + queue_work(mp->m_inodegc_wq, &gc->work); + } +} + +/* + * We set the inode flag atomically with the radix tree tag. Once we get tag + * lookups on the radix tree, this inode flag can go away. + * + * We always use background reclaim here because even if the inode is clean, it + * still may be under IO and hence we have wait for IO completion to occur + * before we can reclaim the inode. The background reclaim path handles this + * more efficiently than we can here, so simply let background reclaim tear down + * all inodes. + */ +void +xfs_inode_mark_reclaimable( + struct xfs_inode *ip) +{ + struct xfs_mount *mp = ip->i_mount; + bool need_inactive; + + XFS_STATS_INC(mp, vn_reclaim); + + /* + * We should never get here with any of the reclaim flags already set. + */ + ASSERT_ALWAYS(!xfs_iflags_test(ip, XFS_ALL_IRECLAIM_FLAGS)); + + need_inactive = xfs_inode_needs_inactive(ip); + if (need_inactive) { + xfs_inodegc_queue(ip); + return; + } + + /* Going straight to reclaim, so drop the dquots. */ + xfs_qm_dqdetach(ip); + xfs_inodegc_set_reclaimable(ip); +} diff --git a/fs/xfs/xfs_icache.h b/fs/xfs/xfs_icache.h index 51d9beddbbc7..b48a368c43e5 100644 --- a/fs/xfs/xfs_icache.h +++ b/fs/xfs/xfs_icache.h @@ -74,4 +74,10 @@ int xfs_icache_inode_is_allocated(struct xfs_mount *mp, struct xfs_trans *tp, void xfs_blockgc_stop(struct xfs_mount *mp); void xfs_blockgc_start(struct xfs_mount *mp);
+void xfs_inodegc_worker(struct work_struct *work); +void xfs_inodegc_flush(struct xfs_mount *mp); +void xfs_inodegc_stop(struct xfs_mount *mp); +void xfs_inodegc_start(struct xfs_mount *mp); +void xfs_inodegc_cpu_dead(struct xfs_mount *mp, unsigned int cpu); + #endif diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index 20ab29f06d09..57141daff28e 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -42,6 +42,7 @@ typedef struct xfs_inode { mrlock_t i_lock; /* inode lock */ mrlock_t i_mmaplock; /* inode mmap IO lock */ atomic_t i_pincount; /* inode pin count */ + struct llist_node i_gclist; /* deferred inactivation list */
/* * Bitsets of inode metadata that have been checked and/or are sick. @@ -228,6 +229,7 @@ static inline bool xfs_inode_has_bigtime(struct xfs_inode *ip) #define __XFS_IPINNED_BIT 8 /* wakeup key for zero pin count */ #define XFS_IPINNED (1 << __XFS_IPINNED_BIT) #define XFS_IEOFBLOCKS (1 << 9) /* has the preallocblocks tag set */ +#define XFS_NEED_INACTIVE (1 << 10) /* see XFS_INACTIVATING below */ /* * If this unlinked inode is in the middle of recovery, don't let drop_inode * truncate and free the inode. This can happen if we iget the inode during @@ -236,6 +238,21 @@ static inline bool xfs_inode_has_bigtime(struct xfs_inode *ip) #define XFS_IRECOVERY (1 << 11) #define XFS_ICOWBLOCKS (1 << 12)/* has the cowblocks tag set */
+/* + * If we need to update on-disk metadata before this IRECLAIMABLE inode can be + * freed, then NEED_INACTIVE will be set. Once we start the updates, the + * INACTIVATING bit will be set to keep iget away from this inode. After the + * inactivation completes, both flags will be cleared and the inode is a + * plain old IRECLAIMABLE inode. + */ +#define XFS_INACTIVATING (1 << 13) + +/* All inode state flags related to inode reclaim. */ +#define XFS_ALL_IRECLAIM_FLAGS (XFS_IRECLAIMABLE | \ + XFS_IRECLAIM | \ + XFS_NEED_INACTIVE | \ + XFS_INACTIVATING) + /* * Per-lifetime flags need to be reset when re-using a reclaimable inode during * inode lookup. This prevents unintended behaviour on the new inode from @@ -243,7 +260,8 @@ static inline bool xfs_inode_has_bigtime(struct xfs_inode *ip) */ #define XFS_IRECLAIM_RESET_FLAGS \ (XFS_IRECLAIMABLE | XFS_IRECLAIM | \ - XFS_IDIRTY_RELEASE | XFS_ITRUNCATED) + XFS_IDIRTY_RELEASE | XFS_ITRUNCATED | XFS_NEED_INACTIVE | \ + XFS_INACTIVATING)
/* * Flags for inode locking. diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index d5a8bea48be2..36f01c8eb57e 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -2791,6 +2791,13 @@ xlog_recover_process_iunlinks( } xfs_buf_rele(agibp); } + + /* + * Flush the pending unlinked inodes to ensure that the inactivations + * are fully completed on disk and the incore inodes can be reclaimed + * before we signal that recovery is complete. + */ + xfs_inodegc_flush(mp); }
STATIC void diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index 18106b08b0be..4b2644c76f59 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -638,7 +638,8 @@ xfs_check_summary_counts( * Flush and reclaim dirty inodes in preparation for unmount. Inodes and * internal inode structures can be sitting in the CIL and AIL at this point, * so we need to unpin them, write them back and/or reclaim them before unmount - * can proceed. + * can proceed. In other words, callers are required to have inactivated all + * inodes. * * An inode cluster that has been freed can have its buffer still pinned in * memory because the transaction is still sitting in a iclog. The stale inodes @@ -670,6 +671,7 @@ xfs_unmount_flush_inodes( mp->m_flags |= XFS_MOUNT_UNMOUNTING;
xfs_ail_push_all_sync(mp->m_ail); + xfs_inodegc_stop(mp); cancel_delayed_work_sync(&mp->m_reclaim_work); xfs_reclaim_inodes(mp); xfs_health_unmount(mp); @@ -894,6 +896,9 @@ xfs_mountfs( if (error) goto out_log_dealloc;
+ /* Enable background inode inactivation workers. */ + xfs_inodegc_start(mp); + /* * Get and sanity-check the root inode. * Save the pointer to it in the mount structure. @@ -1054,6 +1059,15 @@ xfs_mountfs( xfs_irele(rip); /* Clean out dquots that might be in memory after quotacheck. */ xfs_qm_unmount(mp); + + /* + * Inactivate all inodes that might still be in memory after a log + * intent recovery failure so that reclaim can free them. Metadata + * inodes and the root directory shouldn't need inactivation, but the + * mount failed for some reason, so pull down all the state and flee. + */ + xfs_inodegc_flush(mp); + /* * Flush all inode reclamation work and flush the log. * We have to do this /after/ rtunmount and qm_unmount because those @@ -1101,6 +1115,16 @@ xfs_unmountfs( uint64_t resblks; int error;
+ /* + * Perform all on-disk metadata updates required to inactivate inodes + * that the VFS evicted earlier in the unmount process. Freeing inodes + * and discarding CoW fork preallocations can cause shape changes to + * the free inode and refcount btrees, respectively, so we must finish + * this before we discard the metadata space reservations. Metadata + * inodes and the root directory do not require inactivation. + */ + xfs_inodegc_flush(mp); + xfs_blockgc_stop(mp); xfs_fs_unreserve_ag_blocks(mp); xfs_qm_unmount_quotas(mp); diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index 3bae88b45fca..85109bbb3bff 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -55,6 +55,17 @@ struct xfs_error_cfg { long retry_timeout; /* in jiffies, -1 = infinite */ };
+/* + * Per-cpu deferred inode inactivation GC lists. + */ +struct xfs_inodegc { + struct llist_head list; + struct work_struct work; + + /* approximate count of inodes in the list */ + unsigned int items; +}; + /* * The struct xfsmount layout is optimised to separate read-mostly variables * from variables that are frequently modified. We put the read-mostly variables @@ -82,6 +93,8 @@ typedef struct xfs_mount { xfs_buftarg_t *m_logdev_targp;/* ptr to log device */ xfs_buftarg_t *m_rtdev_targp; /* ptr to rt device */ struct list_head m_mount_list; /* global mount list */ + void __percpu *m_inodegc; /* percpu inodegc structures */ + /* * Optional cache of rt summary level per bitmap block with the * invariant that m_rsum_cache[bbno] <= the minimum i for which @@ -94,8 +107,9 @@ typedef struct xfs_mount { struct workqueue_struct *m_unwritten_workqueue; struct workqueue_struct *m_cil_workqueue; struct workqueue_struct *m_reclaim_workqueue; - struct workqueue_struct *m_gc_workqueue; struct workqueue_struct *m_sync_workqueue; + struct workqueue_struct *m_blockgc_wq; + struct workqueue_struct *m_inodegc_wq;
int m_bsize; /* fs logical block size */ uint8_t m_blkbit_log; /* blocklog + NBBY */ @@ -136,6 +150,7 @@ typedef struct xfs_mount { struct xfs_ino_geometry m_ino_geo; /* inode geometry */ struct xfs_trans_resv m_resv; /* precomputed res values */ /* low free space thresholds */ + unsigned long m_opstate; /* dynamic state flags */ bool m_always_cow; bool m_fail_unmount; bool m_finobt_nores; /* no per-AG finobt resv. */ @@ -252,6 +267,32 @@ typedef struct xfs_mount { #define XFS_MOUNT_DAX_ALWAYS (1ULL << 26) #define XFS_MOUNT_DAX_NEVER (1ULL << 27)
+/* + * If set, inactivation worker threads will be scheduled to process queued + * inodegc work. If not, queued inodes remain in memory waiting to be + * processed. + */ +#define XFS_OPSTATE_INODEGC_ENABLED 0 + +#define __XFS_IS_OPSTATE(name, NAME) \ +static inline bool xfs_is_ ## name (struct xfs_mount *mp) \ +{ \ + return test_bit(XFS_OPSTATE_ ## NAME, &mp->m_opstate); \ +} \ +static inline bool xfs_clear_ ## name (struct xfs_mount *mp) \ +{ \ + return test_and_clear_bit(XFS_OPSTATE_ ## NAME, &mp->m_opstate); \ +} \ +static inline bool xfs_set_ ## name (struct xfs_mount *mp) \ +{ \ + return test_and_set_bit(XFS_OPSTATE_ ## NAME, &mp->m_opstate); \ +} + +__XFS_IS_OPSTATE(inodegc_enabled, INODEGC_ENABLED) + +#define XFS_OPSTATE_STRINGS \ + { (1UL << XFS_OPSTATE_INODEGC_ENABLED), "inodegc" } + /* * Max and min values for mount-option defined I/O * preallocation sizes. diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index c669e95aaaee..11557d234e23 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -536,21 +536,29 @@ xfs_init_mount_workqueues( if (!mp->m_reclaim_workqueue) goto out_destroy_cil;
- mp->m_gc_workqueue = alloc_workqueue("xfs-blockgc/%s", - WQ_SYSFS | WQ_UNBOUND | WQ_FREEZABLE | WQ_MEM_RECLAIM, + mp->m_blockgc_wq = alloc_workqueue("xfs-blockgc/%s", + XFS_WQFLAGS(WQ_UNBOUND | WQ_FREEZABLE | WQ_MEM_RECLAIM), 0, mp->m_super->s_id); - if (!mp->m_gc_workqueue) + if (!mp->m_blockgc_wq) goto out_destroy_reclaim;
+ mp->m_inodegc_wq = alloc_workqueue("xfs-inodegc/%s", + XFS_WQFLAGS(WQ_FREEZABLE | WQ_MEM_RECLAIM), + 1, mp->m_super->s_id); + if (!mp->m_inodegc_wq) + goto out_destroy_blockgc; + mp->m_sync_workqueue = alloc_workqueue("xfs-sync/%s", XFS_WQFLAGS(WQ_FREEZABLE), 0, mp->m_super->s_id); if (!mp->m_sync_workqueue) - goto out_destroy_eofb; + goto out_destroy_inodegc;
return 0;
-out_destroy_eofb: - destroy_workqueue(mp->m_gc_workqueue); +out_destroy_inodegc: + destroy_workqueue(mp->m_inodegc_wq); +out_destroy_blockgc: + destroy_workqueue(mp->m_blockgc_wq); out_destroy_reclaim: destroy_workqueue(mp->m_reclaim_workqueue); out_destroy_cil: @@ -568,7 +576,8 @@ xfs_destroy_mount_workqueues( struct xfs_mount *mp) { destroy_workqueue(mp->m_sync_workqueue); - destroy_workqueue(mp->m_gc_workqueue); + destroy_workqueue(mp->m_blockgc_wq); + destroy_workqueue(mp->m_inodegc_wq); destroy_workqueue(mp->m_reclaim_workqueue); destroy_workqueue(mp->m_cil_workqueue); destroy_workqueue(mp->m_unwritten_workqueue); @@ -730,6 +739,8 @@ xfs_fs_sync_fs( { struct xfs_mount *mp = XFS_M(sb);
+ trace_xfs_fs_sync_fs(mp, __return_address); + /* * Doing anything during the async pass would be counterproductive. */ @@ -746,6 +757,22 @@ xfs_fs_sync_fs( flush_delayed_work(&mp->m_log->l_work); }
+ /* + * If we are called with page faults frozen out, it means we are about + * to freeze the transaction subsystem. Take the opportunity to shut + * down inodegc because once SB_FREEZE_FS is set it's too late to + * prevent inactivation races with freeze. The fs doesn't get called + * again by the freezing process until after SB_FREEZE_FS has been set, + * so it's now or never. + * + * We don't care if this is a normal syncfs call that does this or + * freeze that does this - we can run this multiple times without issue + * and we won't race with a restart because a restart can only occur + * when the state is either SB_FREEZE_FS or SB_FREEZE_COMPLETE. + */ + if (sb->s_writers.frozen == SB_FREEZE_PAGEFAULT) + xfs_inodegc_stop(mp); + return 0; }
@@ -897,6 +924,17 @@ xfs_fs_freeze( xfs_quiesce_attr(mp); ret = xfs_sync_sb(mp, true); memalloc_nofs_restore(flags); + + /* + * For read-write filesystems, we need to restart the inodegc on error + * because we stopped it at SB_FREEZE_PAGEFAULT level and a thaw is not + * going to be run to restart it now. We are at SB_FREEZE_FS level + * here, so we can restart safely without racing with a stop in + * xfs_fs_sync_fs(). + */ + if (ret && !(mp->m_flags & XFS_MOUNT_RDONLY)) + xfs_inodegc_start(mp); + return ret; }
@@ -909,6 +947,14 @@ xfs_fs_unfreeze( xfs_restore_resvblks(mp); xfs_log_work_queue(mp); xfs_blockgc_start(mp); + + /* + * Don't reactivate the inodegc worker on a readonly filesystem because + * inodes are sent directly to reclaim. + */ + if (!(mp->m_flags & XFS_MOUNT_RDONLY)) + xfs_inodegc_start(mp); + return 0; }
@@ -1034,6 +1080,35 @@ xfs_destroy_percpu_counters( percpu_counter_destroy(&mp->m_delalloc_blks); }
+static int +xfs_inodegc_init_percpu( + struct xfs_mount *mp) +{ + struct xfs_inodegc *gc; + int cpu; + + mp->m_inodegc = alloc_percpu(struct xfs_inodegc); + if (!mp->m_inodegc) + return -ENOMEM; + + for_each_possible_cpu(cpu) { + gc = per_cpu_ptr(mp->m_inodegc, cpu); + init_llist_head(&gc->list); + gc->items = 0; + INIT_WORK(&gc->work, xfs_inodegc_worker); + } + return 0; +} + +static void +xfs_inodegc_free_percpu( + struct xfs_mount *mp) +{ + if (!mp->m_inodegc) + return; + free_percpu(mp->m_inodegc); +} + static void xfs_fs_put_super( struct super_block *sb) @@ -1051,6 +1126,7 @@ xfs_fs_put_super( xfs_freesb(mp); free_percpu(mp->m_stats.xs_stats); xfs_mount_list_del(mp); + xfs_inodegc_free_percpu(mp); xfs_destroy_percpu_counters(mp); xfs_destroy_mount_workqueues(mp); xfs_close_devices(mp); @@ -1406,6 +1482,10 @@ xfs_fc_fill_super( if (error) goto out_destroy_workqueues;
+ error = xfs_inodegc_init_percpu(mp); + if (error) + goto out_destroy_counters; + /* * All percpu data structures requiring cleanup when a cpu goes offline * must be allocated before adding this @mp to the cpu-dead handler's @@ -1417,7 +1497,7 @@ xfs_fc_fill_super( mp->m_stats.xs_stats = alloc_percpu(struct xfsstats); if (!mp->m_stats.xs_stats) { error = -ENOMEM; - goto out_destroy_counters; + goto out_destroy_inodegc; }
error = xfs_readsb(mp, flags); @@ -1581,8 +1661,10 @@ xfs_fc_fill_super( xfs_freesb(mp); out_free_stats: free_percpu(mp->m_stats.xs_stats); - out_destroy_counters: + out_destroy_inodegc: xfs_mount_list_del(mp); + xfs_inodegc_free_percpu(mp); + out_destroy_counters: xfs_destroy_percpu_counters(mp); out_destroy_workqueues: xfs_destroy_mount_workqueues(mp); @@ -1665,6 +1747,9 @@ xfs_remount_rw( if (error && error != -ENOSPC) return error;
+ /* Re-enable the background inode inactivation worker. */ + xfs_inodegc_start(mp); + return 0; }
@@ -1687,6 +1772,15 @@ xfs_remount_ro( return error; }
+ /* + * Stop the inodegc background worker. xfs_fs_reconfigure already + * flushed all pending inodegc work when it sync'd the filesystem. + * The VFS holds s_umount, so we know that inodes cannot enter + * xfs_fs_destroy_inode during a remount operation. In readonly mode + * we send inodes straight to reclaim, so no inodes will be queued. + */ + xfs_inodegc_stop(mp); + /* Free the per-AG metadata reservation pool. */ error = xfs_fs_unreserve_ag_blocks(mp); if (error) { @@ -2087,7 +2181,7 @@ xfs_cpu_dead( spin_lock(&xfs_mount_list_lock); list_for_each_entry_safe(mp, n, &xfs_mount_list, m_mount_list) { spin_unlock(&xfs_mount_list_lock); - /* xfs_subsys_dead(mp, cpu); */ + xfs_inodegc_cpu_dead(mp, cpu); spin_lock(&xfs_mount_list_lock); } spin_unlock(&xfs_mount_list_lock); diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 8fa6d8bb4312..bc6bed5f0eae 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -139,6 +139,45 @@ DEFINE_PERAG_REF_EVENT(xfs_perag_put); DEFINE_PERAG_REF_EVENT(xfs_perag_set_inode_tag); DEFINE_PERAG_REF_EVENT(xfs_perag_clear_inode_tag);
+DECLARE_EVENT_CLASS(xfs_fs_class, + TP_PROTO(struct xfs_mount *mp, void *caller_ip), + TP_ARGS(mp, caller_ip), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(unsigned long long, mflags) + __field(unsigned long, opstate) + __field(unsigned long, sbflags) + __field(void *, caller_ip) + ), + TP_fast_assign( + if (mp) { + __entry->dev = mp->m_super->s_dev; + __entry->mflags = mp->m_flags; + __entry->opstate = mp->m_opstate; + __entry->sbflags = mp->m_super->s_flags; + } + __entry->caller_ip = caller_ip; + ), + TP_printk("dev %d:%d m_flags 0x%llx opstate (%s) s_flags 0x%lx caller %pS", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->mflags, + __print_flags(__entry->opstate, "|", XFS_OPSTATE_STRINGS), + __entry->sbflags, + __entry->caller_ip) +); + +#define DEFINE_FS_EVENT(name) \ +DEFINE_EVENT(xfs_fs_class, name, \ + TP_PROTO(struct xfs_mount *mp, void *caller_ip), \ + TP_ARGS(mp, caller_ip)) +DEFINE_FS_EVENT(xfs_inodegc_flush); +DEFINE_FS_EVENT(xfs_inodegc_start); +DEFINE_FS_EVENT(xfs_inodegc_stop); +DEFINE_FS_EVENT(xfs_inodegc_worker); +DEFINE_FS_EVENT(xfs_inodegc_queue); +DEFINE_FS_EVENT(xfs_inodegc_throttle); +DEFINE_FS_EVENT(xfs_fs_sync_fs); + DECLARE_EVENT_CLASS(xfs_ag_class, TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno), TP_ARGS(mp, agno), @@ -598,14 +637,17 @@ DECLARE_EVENT_CLASS(xfs_inode_class, TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_ino_t, ino) + __field(unsigned long, iflags) ), TP_fast_assign( __entry->dev = VFS_I(ip)->i_sb->s_dev; __entry->ino = ip->i_ino; + __entry->iflags = ip->i_flags; ), - TP_printk("dev %d:%d ino 0x%llx", + TP_printk("dev %d:%d ino 0x%llx iflags 0x%lx", MAJOR(__entry->dev), MINOR(__entry->dev), - __entry->ino) + __entry->ino, + __entry->iflags) )
#define DEFINE_INODE_EVENT(name) \ @@ -649,6 +691,10 @@ DEFINE_INODE_EVENT(xfs_inode_free_eofblocks_invalid); DEFINE_INODE_EVENT(xfs_inode_set_cowblocks_tag); DEFINE_INODE_EVENT(xfs_inode_clear_cowblocks_tag); DEFINE_INODE_EVENT(xfs_inode_free_cowblocks_invalid); +DEFINE_INODE_EVENT(xfs_inode_set_reclaimable); +DEFINE_INODE_EVENT(xfs_inode_reclaiming); +DEFINE_INODE_EVENT(xfs_inode_set_need_inactive); +DEFINE_INODE_EVENT(xfs_inode_inactivating);
/* * ftrace's __print_symbolic requires that all enum values be wrapped in the
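The machinery above reduces to a simple pattern: producers push onto a per-CPU list and bump an approximate depth counter, and a worker steals the whole list at once and drains it. Below is a deliberately simplified userspace model of that queue/drain shape -- a mutex-protected list and a pthread stand in for the kernel's lock-free llist and bound workqueue, and QUEUE_THRESHOLD plays the role of inodes_per_cluster; none of these names are the kernel's:

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

/* Hypothetical stand-in for the i_gclist linkage on struct xfs_inode. */
struct node {
	struct node	*next;
	int		ino;
};

static struct node *gc_list;			/* models gc->list */
static unsigned int gc_items;			/* models gc->items */
static pthread_mutex_t gc_lock = PTHREAD_MUTEX_INITIALIZER;

#define QUEUE_THRESHOLD	2			/* models inodes_per_cluster */

/* Models xfs_inodegc_worker(): steal the whole batch, then drain it. */
static void *inodegc_worker(void *arg)
{
	struct node *batch;

	(void)arg;
	pthread_mutex_lock(&gc_lock);
	batch = gc_list;			/* llist_del_all() analogue */
	gc_list = NULL;
	gc_items = 0;
	pthread_mutex_unlock(&gc_lock);

	while (batch) {
		struct node *next = batch->next;

		printf("inactivating inode %d\n", batch->ino);
		free(batch);
		batch = next;
	}
	return NULL;
}

/* Models xfs_inodegc_queue(): push, then kick the worker past a threshold. */
static void inodegc_queue(int ino)
{
	struct node *n = malloc(sizeof(*n));
	unsigned int depth;

	n->ino = ino;
	pthread_mutex_lock(&gc_lock);
	n->next = gc_list;
	gc_list = n;
	depth = gc_items++;
	pthread_mutex_unlock(&gc_lock);

	if (depth > QUEUE_THRESHOLD) {
		pthread_t t;

		pthread_create(&t, NULL, inodegc_worker, NULL);
		pthread_join(t, NULL);		/* flush_work() analogue */
	}
}

int main(void)
{
	for (int ino = 1; ino <= 8; ino++)
		inodegc_queue(ino);
	inodegc_worker(NULL);			/* final flush at "unmount" */
	return 0;
}

The kernel version avoids the lock entirely: llist_add() and llist_del_all() are lock-free, which is why gc->items can only ever be an approximate count.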
From: "Darrick J. Wong" djwong@kernel.org
mainline-inclusion from mainline-v5.14-rc4 commit 7d6f07d2c5ad9fce298889eeed317d512a2df8cd category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4KIAO CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
-------------------------------------------------
Now that we have made the inactivation of unlinked inodes a background task to increase the throughput of file deletions, we need to be a little more careful about how long a delay we can tolerate.
On a mostly empty filesystem, the risk of the allocator making poor decisions due to fragmentation of the free space on account of a lengthy delay in background updates is minimal because there's plenty of space. However, if free space is tight, we want to deallocate unlinked inodes as quickly as possible to avoid fallocate ENOSPC and to give the allocator the best shot at optimal allocations for new writes.
Therefore, queue the percpu worker immediately if the filesystem is more than 95% full. This follows the same principle that XFS becomes less aggressive about speculative allocations and lazy cleanup (and more precise about accounting) when nearing full.
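A minimal sketch of that trigger, assuming the precomputed 5% threshold described here; the actual patch compares m_fdblocks via __percpu_counter_compare() against m_low_space[XFS_LOWSP_5_PCNT] rather than doing the division at queue time:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static bool want_queue_inodegc(uint64_t free_blocks, uint64_t total_blocks)
{
	/* m_low_space[XFS_LOWSP_5_PCNT]: 5% of sb_dblocks, set at mount. */
	uint64_t low_space_5pct = total_blocks / 100 * 5;

	return free_blocks < low_space_5pct;
}

int main(void)
{
	/* 2 TB of 4k blocks, 3% free: the worker should be kicked now. */
	uint64_t total = 2ULL << 40 >> 12;

	printf("queue now: %d\n", want_queue_inodegc(total * 3 / 100, total));
	return 0;
}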
Signed-off-by: Darrick J. Wong djwong@kernel.org Reviewed-by: Dave Chinner dchinner@redhat.com Signed-off-by: Lihong Kou koulihong@huawei.com Reviewed-by: Zhang Yi yi.zhang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- fs/xfs/xfs_icache.c | 6 ++++++ fs/xfs/xfs_mount.c | 8 -------- fs/xfs/xfs_mount.h | 9 +++++++++ 3 files changed, 15 insertions(+), 8 deletions(-)
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index eb27e11d67f1..a7e53a526e6d 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -1928,6 +1928,7 @@ xfs_inodegc_start( * Schedule the inactivation worker when: * * - We've accumulated more than one inode cluster buffer's worth of inodes. + * - There is less than 5% free space left. */ static inline bool xfs_inodegc_want_queue_work( @@ -1939,6 +1940,11 @@ xfs_inodegc_want_queue_work( if (items > mp->m_ino_geo.inodes_per_cluster) return true;
+ if (__percpu_counter_compare(&mp->m_fdblocks, + mp->m_low_space[XFS_LOWSP_5_PCNT], + XFS_FDBLOCKS_BATCH) < 0) + return true; + return false; }
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index 4b2644c76f59..ba74d0e3711e 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -1221,14 +1221,6 @@ xfs_log_sbcount(xfs_mount_t *mp) return xfs_sync_sb(mp, true); }
-/* - * Deltas for the block count can vary from 1 to very large, but lock contention - * only occurs on frequent small block count updates such as in the delayed - * allocation path for buffered writes (page a time updates). Hence we set - * a large batch count (1024) to minimise global counter updates except when - * we get near to ENOSPC and we have to be very accurate with our updates. - */ -#define XFS_FDBLOCKS_BATCH 1024 int xfs_mod_fdblocks( struct xfs_mount *mp, diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index 85109bbb3bff..8e0589cd3636 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -447,6 +447,15 @@ extern int xfs_initialize_perag(xfs_mount_t *mp, xfs_agnumber_t agcount, xfs_agnumber_t *maxagi); extern void xfs_unmountfs(xfs_mount_t *);
+/* + * Deltas for the block count can vary from 1 to very large, but lock contention + * only occurs on frequent small block count updates such as in the delayed + * allocation path for buffered writes (page a time updates). Hence we set + * a large batch count (1024) to minimise global counter updates except when + * we get near to ENOSPC and we have to be very accurate with our updates. + */ +#define XFS_FDBLOCKS_BATCH 1024 + extern int xfs_mod_fdblocks(struct xfs_mount *mp, int64_t delta, bool reserved); extern int xfs_mod_frextents(struct xfs_mount *mp, int64_t delta);
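Exporting XFS_FDBLOCKS_BATCH matters because a caller comparing the counter against a threshold has to tolerate the slack a batched percpu counter accumulates: each CPU can hold a local delta just under the batch size before folding it into the global sum. A back-of-the-envelope sketch of the worst-case drift (the CPU count here is an arbitrary example, not a kernel value):

#include <stdint.h>
#include <stdio.h>

#define XFS_FDBLOCKS_BATCH	1024

/* Worst case: every CPU holds an unfolded delta just under the batch. */
static uint64_t max_counter_drift(unsigned int nr_cpus)
{
	return (uint64_t)nr_cpus * XFS_FDBLOCKS_BATCH;
}

int main(void)
{
	printf("a 64-CPU box can hide up to %llu blocks in local deltas\n",
	       (unsigned long long)max_counter_drift(64));
	return 0;
}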
From: "Darrick J. Wong" djwong@kernel.org
mainline-inclusion from mainline-v5.14-rc4 commit 108523b8de676a45cef1f6c8566c444222b85de0 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4KIAO CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
-------------------------------------------------
Now that we have made the inactivation of unlinked inodes a background task to increase the throughput of file deletions, we need to be a little more careful about how long a delay we can tolerate.
Specifically, if the dquots attached to the inode being inactivated are nearing any kind of enforcement boundary, we want to queue that inactivation work immediately so that users don't get EDQUOT/ENOSPC errors even after they deleted a bunch of files to stay within quota.
Signed-off-by: Darrick J. Wong djwong@kernel.org Reviewed-by: Dave Chinner dchinner@redhat.com Signed-off-by: Lihong Kou koulihong@huawei.com Reviewed-by: Zhang Yi yi.zhang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- fs/xfs/xfs_dquot.h | 10 ++++++++++ fs/xfs/xfs_icache.c | 10 ++++++++++ fs/xfs/xfs_qm.c | 34 ++++++++++++++++++++++++++++++++++ fs/xfs/xfs_quota.h | 2 ++ 4 files changed, 56 insertions(+)
diff --git a/fs/xfs/xfs_dquot.h b/fs/xfs/xfs_dquot.h index f642884a6834..6b5e3cf40c8b 100644 --- a/fs/xfs/xfs_dquot.h +++ b/fs/xfs/xfs_dquot.h @@ -54,6 +54,16 @@ struct xfs_dquot_res { xfs_qwarncnt_t warnings; };
+static inline bool +xfs_dquot_res_over_limits( + const struct xfs_dquot_res *qres) +{ + if ((qres->softlimit && qres->softlimit < qres->reserved) || + (qres->hardlimit && qres->hardlimit < qres->reserved)) + return true; + return false; +} + /* * The incore dquot structure */ diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index a7e53a526e6d..5ee7306a7291 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -1929,6 +1929,7 @@ xfs_inodegc_start( * * - We've accumulated more than one inode cluster buffer's worth of inodes. * - There is less than 5% free space left. + * - Any of the quotas for this inode are near an enforcement limit. */ static inline bool xfs_inodegc_want_queue_work( @@ -1945,6 +1946,15 @@ xfs_inodegc_want_queue_work( XFS_FDBLOCKS_BATCH) < 0) return true;
+ if (xfs_inode_near_dquot_enforcement(ip, XFS_DQTYPE_USER)) + return true; + + if (xfs_inode_near_dquot_enforcement(ip, XFS_DQTYPE_GROUP)) + return true; + + if (xfs_inode_near_dquot_enforcement(ip, XFS_DQTYPE_PROJ)) + return true; + return false; }
diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index 601eec773bb2..926d262c1236 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -1877,3 +1877,37 @@ xfs_qm_vop_create_dqattach( } }
+/* Decide if this inode's dquot is near an enforcement boundary. */ +bool +xfs_inode_near_dquot_enforcement( + struct xfs_inode *ip, + xfs_dqtype_t type) +{ + struct xfs_dquot *dqp; + int64_t freesp; + + /* We only care for quotas that are enabled and enforced. */ + dqp = xfs_inode_dquot(ip, type); + if (!dqp || !xfs_dquot_is_enforced(dqp)) + return false; + + if (xfs_dquot_res_over_limits(&dqp->q_ino) || + xfs_dquot_res_over_limits(&dqp->q_rtb)) + return true; + + /* For space on the data device, check the various thresholds. */ + if (!dqp->q_prealloc_hi_wmark) + return false; + + if (dqp->q_blk.reserved < dqp->q_prealloc_lo_wmark) + return false; + + if (dqp->q_blk.reserved >= dqp->q_prealloc_hi_wmark) + return true; + + freesp = dqp->q_prealloc_hi_wmark - dqp->q_blk.reserved; + if (freesp < dqp->q_low_space[XFS_QLOWSP_5_PCNT]) + return true; + + return false; +} diff --git a/fs/xfs/xfs_quota.h b/fs/xfs/xfs_quota.h index d00d01302545..dcc785fdd345 100644 --- a/fs/xfs/xfs_quota.h +++ b/fs/xfs/xfs_quota.h @@ -113,6 +113,7 @@ xfs_quota_reserve_blkres(struct xfs_inode *ip, int64_t blocks) { return xfs_trans_reserve_quota_nblks(NULL, ip, blocks, 0, false); } +bool xfs_inode_near_dquot_enforcement(struct xfs_inode *ip, xfs_dqtype_t type); #else static inline int xfs_qm_vop_dqalloc(struct xfs_inode *ip, kuid_t kuid, kgid_t kgid, @@ -168,6 +169,7 @@ xfs_trans_reserve_quota_icreate(struct xfs_trans *tp, struct xfs_dquot *udqp, #define xfs_qm_mount_quotas(mp) #define xfs_qm_unmount(mp) #define xfs_qm_unmount_quotas(mp) +#define xfs_inode_near_dquot_enforcement(ip, type) (false) #endif /* CONFIG_XFS_QUOTA */
static inline int
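To restate the enforcement-boundary walk from the xfs_qm.c hunk above in self-contained form (field names are simplified stand-ins; the lo/hi watermarks and the 5% value correspond to q_prealloc_lo_wmark, q_prealloc_hi_wmark, and q_low_space[XFS_QLOWSP_5_PCNT]):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct res {
	uint64_t	reserved;
	uint64_t	softlimit;
	uint64_t	hardlimit;
};

/* Mirrors xfs_dquot_res_over_limits(): a zero limit means "no limit". */
static bool res_over_limits(const struct res *r)
{
	return (r->softlimit && r->softlimit < r->reserved) ||
	       (r->hardlimit && r->hardlimit < r->reserved);
}

static bool near_enforcement(const struct res *ino, const struct res *rtb,
			     uint64_t blk_reserved, uint64_t lo_wmark,
			     uint64_t hi_wmark, uint64_t low_space_5pct)
{
	if (res_over_limits(ino) || res_over_limits(rtb))
		return true;

	/* For space on the data device, walk the preallocation watermarks. */
	if (!hi_wmark || blk_reserved < lo_wmark)
		return false;
	if (blk_reserved >= hi_wmark)
		return true;
	return hi_wmark - blk_reserved < low_space_5pct;
}

int main(void)
{
	struct res ino = { .reserved = 90, .softlimit = 100 };
	struct res rtb = { 0 };

	/* 950 of 1000 blocks reserved: only 50 left before the hi watermark. */
	printf("near enforcement: %d\n",
	       near_enforcement(&ino, &rtb, 950, 500, 1000, 64));
	return 0;
}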
From: "Darrick J. Wong" djwong@kernel.org
mainline-inclusion from mainline-v5.14-rc4 commit 65f03d8652b240aa66b99a07e3c423a51e967568 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4KIAO CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
-------------------------------------------------
Now that we have made the inactivation of unlinked inodes a background task to increase the throughput of file deletions, we need to be a little more careful about how long a delay we can tolerate.
Similar to the patch doing this for free space on the data device, if the file being inactivated is a realtime file and the realtime volume is running low on free extents, we want to run the worker ASAP so that the realtime allocator can make better decisions.
Signed-off-by: Darrick J. Wong djwong@kernel.org Reviewed-by: Dave Chinner dchinner@redhat.com Signed-off-by: Lihong Kou koulihong@huawei.com Reviewed-by: Zhang Yi yi.zhang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- fs/xfs/xfs_icache.c | 21 +++++++++++++++++++++ fs/xfs/xfs_mount.c | 13 ++++++++----- fs/xfs/xfs_mount.h | 3 ++- 3 files changed, 31 insertions(+), 6 deletions(-)
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index 5ee7306a7291..949f5f8ac6af 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -1924,6 +1924,24 @@ xfs_inodegc_start( xfs_inodegc_queue_all(mp); }
+#ifdef CONFIG_XFS_RT +static inline bool +xfs_inodegc_want_queue_rt_file( + struct xfs_inode *ip) +{ + struct xfs_mount *mp = ip->i_mount; + uint64_t freertx; + + if (!XFS_IS_REALTIME_INODE(ip)) + return false; + + freertx = READ_ONCE(mp->m_sb.sb_frextents); + return freertx < mp->m_low_rtexts[XFS_LOWSP_5_PCNT]; +} +#else +# define xfs_inodegc_want_queue_rt_file(ip) (false) +#endif /* CONFIG_XFS_RT */ + /* * Schedule the inactivation worker when: * @@ -1946,6 +1964,9 @@ xfs_inodegc_want_queue_work( XFS_FDBLOCKS_BATCH) < 0) return true;
+ if (xfs_inodegc_want_queue_rt_file(ip)) + return true; + if (xfs_inode_near_dquot_enforcement(ip, XFS_DQTYPE_USER)) return true;
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index ba74d0e3711e..b9be1554aeb1 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -489,13 +489,16 @@ void xfs_set_low_space_thresholds( struct xfs_mount *mp) { - int i; + uint64_t dblocks = mp->m_sb.sb_dblocks; + uint64_t rtexts = mp->m_sb.sb_rextents; + int i;
- for (i = 0; i < XFS_LOWSP_MAX; i++) { - uint64_t space = mp->m_sb.sb_dblocks; + do_div(dblocks, 100); + do_div(rtexts, 100);
- do_div(space, 100); - mp->m_low_space[i] = space * (i + 1); + for (i = 0; i < XFS_LOWSP_MAX; i++) { + mp->m_low_space[i] = dblocks * (i + 1); + mp->m_low_rtexts[i] = rtexts * (i + 1); } }
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index 8e0589cd3636..5d3762982725 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -146,7 +146,8 @@ typedef struct xfs_mount { int m_fixedfsid[2]; /* unchanged for life of FS */ uint m_qflags; /* quota status flags */ uint64_t m_flags; /* global mount flags */ - int64_t m_low_space[XFS_LOWSP_MAX]; + uint64_t m_low_space[XFS_LOWSP_MAX]; + uint64_t m_low_rtexts[XFS_LOWSP_MAX]; struct xfs_ino_geometry m_ino_geo; /* inode geometry */ struct xfs_trans_resv m_resv; /* precomputed res values */ /* low free space thresholds */
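The reworked xfs_set_low_space_thresholds() above fills both tables in 1% steps. A standalone restatement, with plain 64-bit division in place of do_div() and XFS_LOWSP_MAX assumed to be 5 as in mainline:

#include <stdint.h>
#include <stdio.h>

#define XFS_LOWSP_MAX	5	/* 1%..5%; XFS_LOWSP_5_PCNT is index 4 */

static void set_low_space_thresholds(uint64_t dblocks, uint64_t rtexts,
				     uint64_t low_space[XFS_LOWSP_MAX],
				     uint64_t low_rtexts[XFS_LOWSP_MAX])
{
	dblocks /= 100;
	rtexts /= 100;

	for (int i = 0; i < XFS_LOWSP_MAX; i++) {
		low_space[i] = dblocks * (i + 1);
		low_rtexts[i] = rtexts * (i + 1);
	}
}

int main(void)
{
	uint64_t ls[XFS_LOWSP_MAX], lr[XFS_LOWSP_MAX];

	set_low_space_thresholds(1000000, 50000, ls, lr);
	printf("5%% thresholds: %llu data blocks, %llu rt extents\n",
	       (unsigned long long)ls[4], (unsigned long long)lr[4]);
	return 0;
}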
From: "Darrick J. Wong" djwong@kernel.org
mainline-inclusion from mainline-v5.14-rc4 commit 2eb665027b6528c1a8e9158c2f722a6ec0af359d category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4KIAO CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
-------------------------------------------------
Other parts of XFS have learned to call xfs_blockgc_free_{space,quota} to try to free speculative preallocations when space is tight. This means that file writes, transaction reservation failures, quota limit enforcement, and the EOFBLOCKS ioctl all call this function to free space when things are tight.
Since inode inactivation is now a background task, this means that the filesystem can be hanging on to unlinked but not yet freed space. Add this to the list of things that xfs_blockgc_free_* makes writer threads scan for when they cannot reserve space.
Signed-off-by: Darrick J. Wong djwong@kernel.org Reviewed-by: Dave Chinner dchinner@redhat.com Signed-off-by: Lihong Kou koulihong@huawei.com Reviewed-by: Zhang Yi yi.zhang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- fs/xfs/xfs_icache.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-)
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index 949f5f8ac6af..893118e73de0 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -1476,16 +1476,24 @@ xfs_blockgc_worker( }
/* - * Try to free space in the filesystem by purging eofblocks and cowblocks. + * Try to free space in the filesystem by purging inactive inodes, eofblocks + * and cowblocks. */ int xfs_blockgc_free_space( struct xfs_mount *mp, struct xfs_icwalk *icw) { + int error; + trace_xfs_blockgc_free_space(mp, icw, _RET_IP_);
- return xfs_icwalk(mp, XFS_ICWALK_BLOCKGC, icw); + error = xfs_icwalk(mp, XFS_ICWALK_BLOCKGC, icw); + if (error) + return error; + + xfs_inodegc_flush(mp); + return 0; }
/*
From: "Darrick J. Wong" djwong@kernel.org
mainline-inclusion from mainline-v5.14-rc4 commit 01e8f379a4895a9a173391408db4fb49ec91e148 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4KIAO CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
-------------------------------------------------
Users have come to expect that the space accounting information in statfs and getquota reports is fairly accurate. Now that we inactivate inodes from a background queue, these numbers can be thrown off by whatever resources are singly-owned by the inodes in the queue. Flush the pending inactivations when userspace asks for a space usage report.
Signed-off-by: Darrick J. Wong djwong@kernel.org Reviewed-by: Dave Chinner dchinner@redhat.com Signed-off-by: Lihong Kou koulihong@huawei.com Reviewed-by: Zhang Yi yi.zhang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- fs/xfs/xfs_qm_syscalls.c | 8 ++++++++ fs/xfs/xfs_super.c | 3 +++ 2 files changed, 11 insertions(+)
diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c index 7e7a7bead8c6..b541dc3fac17 100644 --- a/fs/xfs/xfs_qm_syscalls.c +++ b/fs/xfs/xfs_qm_syscalls.c @@ -481,6 +481,10 @@ xfs_qm_scall_getquota( struct xfs_dquot *dqp; int error;
+ /* Flush inodegc work at the start of a quota reporting scan. */ + if (id == 0) + xfs_inodegc_flush(mp); + /* * Try to get the dquot. We don't want it allocated on disk, so don't * set doalloc. If it doesn't exist, we'll get ENOENT back. @@ -519,6 +523,10 @@ xfs_qm_scall_getquota_next( struct xfs_dquot *dqp; int error;
+ /* Flush inodegc work at the start of a quota reporting scan. */ + if (*id == 0) + xfs_inodegc_flush(mp); + error = xfs_qm_dqget_next(mp, *id, type, &dqp); if (error) return error; diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 11557d234e23..23ff5bd0191a 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -791,6 +791,9 @@ xfs_fs_statfs( xfs_extlen_t lsize; int64_t ffree;
+ /* Wait for whatever inactivations are in progress. */ + xfs_inodegc_flush(mp); + statp->f_type = XFS_SUPER_MAGIC; statp->f_namelen = MAXNAMELEN - 1;
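The ordering is the whole point: drain the deferred queue first, then sample the counters, so space owned solely by queued inodes shows up as free. A toy model of that reporting pattern (all names and the 16-blocks-per-inode figure are illustrative, not the kernel's):

#include <stdint.h>
#include <stdio.h>

struct toy_fs {
	uint64_t	free_blocks;
	unsigned int	queued_inactivations;
};

/* Stand-in for xfs_inodegc_flush(): completes every queued inactivation. */
static void inodegc_flush(struct toy_fs *fs)
{
	fs->free_blocks += 16ULL * fs->queued_inactivations;
	fs->queued_inactivations = 0;
}

static uint64_t report_free_blocks(struct toy_fs *fs)
{
	inodegc_flush(fs);	/* otherwise the report undercounts free space */
	return fs->free_blocks;
}

int main(void)
{
	struct toy_fs fs = { .free_blocks = 100, .queued_inactivations = 4 };

	printf("statfs-style free blocks: %llu\n",
	       (unsigned long long)report_free_blocks(&fs));
	return 0;
}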
From: "Darrick J. Wong" djwong@kernel.org
mainline-inclusion from mainline-v5.14-rc4 commit 6f6490914d9b712004ddad648e47b1bf22647978 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4KIAO CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
-------------------------------------------------
Now that we have the infrastructure to switch background workers on and off at will, fix the block gc worker code so that we don't actually run the worker when the filesystem is frozen, same as we do for deferred inactivation.
Signed-off-by: Darrick J. Wong djwong@kernel.org Reviewed-by: Dave Chinner dchinner@redhat.com Signed-off-by: Lihong Kou koulihong@huawei.com Reviewed-by: Zhang Yi yi.zhang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- fs/xfs/scrub/common.c | 9 +++++---- fs/xfs/xfs_icache.c | 18 +++++++++++++++--- fs/xfs/xfs_mount.c | 1 + fs/xfs/xfs_mount.h | 10 +++++++++- fs/xfs/xfs_super.c | 21 ++++++++++++++------- fs/xfs/xfs_trace.h | 3 +++ 6 files changed, 47 insertions(+), 15 deletions(-)
diff --git a/fs/xfs/scrub/common.c b/fs/xfs/scrub/common.c index 5743cf6f8b18..63ea31f9ade3 100644 --- a/fs/xfs/scrub/common.c +++ b/fs/xfs/scrub/common.c @@ -914,11 +914,12 @@ xchk_start_reaping( struct xfs_scrub *sc) { /* - * Readonly filesystems do not perform inactivation, so there's no - * need to restart the worker. + * Readonly filesystems do not perform inactivation or speculative + * preallocation, so there's no need to restart the workers. */ - if (!(sc->mp->m_flags & XFS_MOUNT_RDONLY)) + if (!(sc->mp->m_flags & XFS_MOUNT_RDONLY)) { xfs_inodegc_start(sc->mp); - xfs_blockgc_start(sc->mp); + xfs_blockgc_start(sc->mp); + } sc->flags &= ~XCHK_REAPING_DISABLED; } diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index 893118e73de0..438a7a2d4bc9 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -210,6 +210,11 @@ static inline void xfs_blockgc_queue( struct xfs_perag *pag) { + struct xfs_mount *mp = pag->pag_mount; + + if (!xfs_is_blockgc_enabled(mp)) + return; + rcu_read_lock(); if (radix_tree_tagged(&pag->pag_ici_root, XFS_ICI_BLOCKGC_TAG)) queue_delayed_work(pag->pag_mount->m_blockgc_wq, @@ -1374,8 +1379,12 @@ xfs_blockgc_stop( struct xfs_perag *pag; xfs_agnumber_t agno;
+ if (!xfs_clear_blockgc_enabled(mp)) + return; + for_each_perag_tag(mp, agno, pag, XFS_ICI_BLOCKGC_TAG) cancel_delayed_work_sync(&pag->pag_blockgc_work); + trace_xfs_blockgc_stop(mp, __return_address); }
/* Enable post-EOF and CoW block auto-reclamation. */ @@ -1386,6 +1395,10 @@ xfs_blockgc_start( struct xfs_perag *pag; xfs_agnumber_t agno;
+ if (xfs_set_blockgc_enabled(mp)) + return; + + trace_xfs_blockgc_start(mp, __return_address); for_each_perag_tag(mp, agno, pag, XFS_ICI_BLOCKGC_TAG) xfs_blockgc_queue(pag); } @@ -1465,13 +1478,12 @@ xfs_blockgc_worker( struct xfs_mount *mp = pag->pag_mount; int error;
- if (!sb_start_write_trylock(mp->m_super)) - return; + trace_xfs_blockgc_worker(mp, __return_address); + error = xfs_icwalk_ag(pag, XFS_ICWALK_BLOCKGC, NULL); if (error) xfs_info(mp, "AG %u preallocation gc worker failed, err=%d", pag->pag_agno, error); - sb_end_write(mp->m_super); xfs_blockgc_queue(pag); }
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index b9be1554aeb1..1539d8d49c09 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -901,6 +901,7 @@ xfs_mountfs(
/* Enable background inode inactivation workers. */ xfs_inodegc_start(mp); + xfs_blockgc_start(mp);
/* * Get and sanity-check the root inode. diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index 5d3762982725..805c04a4b278 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -274,6 +274,12 @@ typedef struct xfs_mount { * processed. */ #define XFS_OPSTATE_INODEGC_ENABLED 0 +/* + * If set, background speculative prealloc gc worker threads will be scheduled + * to process queued blockgc work. If not, inodes retain their preallocations + * until explicitly deleted. + */ +#define XFS_OPSTATE_BLOCKGC_ENABLED 1
#define __XFS_IS_OPSTATE(name, NAME) \ static inline bool xfs_is_ ## name (struct xfs_mount *mp) \ @@ -290,9 +296,11 @@ static inline bool xfs_set_ ## name (struct xfs_mount *mp) \ }
__XFS_IS_OPSTATE(inodegc_enabled, INODEGC_ENABLED) +__XFS_IS_OPSTATE(blockgc_enabled, BLOCKGC_ENABLED)
#define XFS_OPSTATE_STRINGS \ - { (1UL << XFS_OPSTATE_INODEGC_ENABLED), "inodegc" } + { (1UL << XFS_OPSTATE_INODEGC_ENABLED), "inodegc" }, \ + { (1UL << XFS_OPSTATE_BLOCKGC_ENABLED), "blockgc" }
/* * Max and min values for mount-option defined I/O diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 23ff5bd0191a..0c59b538bab5 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -763,15 +763,18 @@ xfs_fs_sync_fs( * down inodegc because once SB_FREEZE_FS is set it's too late to * prevent inactivation races with freeze. The fs doesn't get called * again by the freezing process until after SB_FREEZE_FS has been set, - * so it's now or never. + * so it's now or never. Same logic applies to speculative allocation + * garbage collection. * * We don't care if this is a normal syncfs call that does this or * freeze that does this - we can run this multiple times without issue * and we won't race with a restart because a restart can only occur * when the state is either SB_FREEZE_FS or SB_FREEZE_COMPLETE. */ - if (sb->s_writers.frozen == SB_FREEZE_PAGEFAULT) + if (sb->s_writers.frozen == SB_FREEZE_PAGEFAULT) { xfs_inodegc_stop(mp); + xfs_blockgc_stop(mp); + }
return 0; } @@ -922,7 +925,6 @@ xfs_fs_freeze( * set a GFP_NOFS context here to avoid recursion deadlocks. */ flags = memalloc_nofs_save(); - xfs_blockgc_stop(mp); xfs_save_resvblks(mp); xfs_quiesce_attr(mp); ret = xfs_sync_sb(mp, true); @@ -935,8 +937,10 @@ xfs_fs_freeze( * here, so we can restart safely without racing with a stop in * xfs_fs_sync_fs(). */ - if (ret && !(mp->m_flags & XFS_MOUNT_RDONLY)) + if (ret && !(mp->m_flags & XFS_MOUNT_RDONLY)) { + xfs_blockgc_start(mp); xfs_inodegc_start(mp); + }
return ret; } @@ -949,14 +953,17 @@ xfs_fs_unfreeze(
xfs_restore_resvblks(mp); xfs_log_work_queue(mp); - xfs_blockgc_start(mp);
/* * Don't reactivate the inodegc worker on a readonly filesystem because - * inodes are sent directly to reclaim. + * inodes are sent directly to reclaim. Don't reactivate the blockgc + * worker because there are no speculative preallocations on a readonly + * filesystem. */ - if (!(mp->m_flags & XFS_MOUNT_RDONLY)) + if (!(mp->m_flags & XFS_MOUNT_RDONLY)) { + xfs_blockgc_start(mp); xfs_inodegc_start(mp); + }
return 0; } diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index bc6bed5f0eae..4a3b122d62b5 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -177,6 +177,9 @@ DEFINE_FS_EVENT(xfs_inodegc_worker); DEFINE_FS_EVENT(xfs_inodegc_queue); DEFINE_FS_EVENT(xfs_inodegc_throttle); DEFINE_FS_EVENT(xfs_fs_sync_fs); +DEFINE_FS_EVENT(xfs_blockgc_start); +DEFINE_FS_EVENT(xfs_blockgc_stop); +DEFINE_FS_EVENT(xfs_blockgc_worker);
DECLARE_EVENT_CLASS(xfs_ag_class, TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno),
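Both the inodegc and blockgc start/stop pairs are idempotent because they key off test_and_set_bit()/test_and_clear_bit() on m_opstate, so xfs_fs_sync_fs() can stop the workers repeatedly without harm. A userspace model of that pattern using C11 atomics (names are illustrative):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define OPSTATE_INODEGC_ENABLED	0
#define OPSTATE_BLOCKGC_ENABLED	1

static atomic_ulong opstate;

static bool test_and_set_state(int bit)
{
	unsigned long mask = 1UL << bit;

	return atomic_fetch_or(&opstate, mask) & mask;
}

static bool test_and_clear_state(int bit)
{
	unsigned long mask = 1UL << bit;

	return atomic_fetch_and(&opstate, ~mask) & mask;
}

/* A second start is a no-op; likewise a second stop. */
static void inodegc_start(void)
{
	if (test_and_set_state(OPSTATE_INODEGC_ENABLED))
		return;			/* already enabled */
	puts("inodegc enabled, queueing pending work");
}

static void inodegc_stop(void)
{
	if (!test_and_clear_state(OPSTATE_INODEGC_ENABLED))
		return;			/* already disabled */
	puts("inodegc disabled, cancelling workers");
}

int main(void)
{
	inodegc_start();
	inodegc_start();		/* silently ignored */
	inodegc_stop();
	inodegc_stop();			/* silently ignored */
	return 0;
}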
From: "Darrick J. Wong" djwong@kernel.org
mainline-inclusion from mainline-v5.14-rc4 commit e8d04c2abcebd66bdbacd53bb273d824d4e27080 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4KIAO CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
-------------------------------------------------
In xfs_trans_alloc, if the block reservation call returns ENOSPC, we call xfs_blockgc_free_space with a NULL icwalk structure to try to free space. Each frontend thread that encounters this situation starts its own walk of the inode cache to see if it can find anything, which is wasteful since we don't have any additional selection criteria. For this one common case, create a function that reschedules all pending background work immediately and flushes the workqueue so that the scan can run in parallel.
Signed-off-by: Darrick J. Wong djwong@kernel.org Reviewed-by: Dave Chinner dchinner@redhat.com Signed-off-by: Lihong Kou koulihong@huawei.com Reviewed-by: Zhang Yi yi.zhang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- fs/xfs/xfs_icache.c | 28 ++++++++++++++++++++++++++++ fs/xfs/xfs_icache.h | 1 + fs/xfs/xfs_trace.h | 1 + fs/xfs/xfs_trans.c | 5 +---- 4 files changed, 31 insertions(+), 4 deletions(-)
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index 438a7a2d4bc9..17dc512b98df 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -1508,6 +1508,34 @@ xfs_blockgc_free_space( return 0; }
+/* + * Reclaim all the free space that we can by scheduling the background blockgc + * and inodegc workers immediately and waiting for them all to clear. + */ +void +xfs_blockgc_flush_all( + struct xfs_mount *mp) +{ + struct xfs_perag *pag; + xfs_agnumber_t agno; + + trace_xfs_blockgc_flush_all(mp, __return_address); + + /* + * For each blockgc worker, move its queue time up to now. If it + * wasn't queued, it will not be requeued. Then flush whatever's + * left. + */ + for_each_perag_tag(mp, agno, pag, XFS_ICI_BLOCKGC_TAG) + mod_delayed_work(pag->pag_mount->m_blockgc_wq, + &pag->pag_blockgc_work, 0); + + for_each_perag_tag(mp, agno, pag, XFS_ICI_BLOCKGC_TAG) + flush_delayed_work(&pag->pag_blockgc_work); + + xfs_inodegc_flush(mp); +} + /* * Run cow/eofblocks scans on the supplied dquots. We don't know exactly which * quota caused an allocation failure, so we make a best effort by including diff --git a/fs/xfs/xfs_icache.h b/fs/xfs/xfs_icache.h index b48a368c43e5..8198e84747c9 100644 --- a/fs/xfs/xfs_icache.h +++ b/fs/xfs/xfs_icache.h @@ -59,6 +59,7 @@ int xfs_blockgc_free_dquots(struct xfs_mount *mp, struct xfs_dquot *udqp, unsigned int iwalk_flags); int xfs_blockgc_free_quota(struct xfs_inode *ip, unsigned int iwalk_flags); int xfs_blockgc_free_space(struct xfs_mount *mp, struct xfs_icwalk *icm); +void xfs_blockgc_flush_all(struct xfs_mount *mp);
void xfs_inode_set_eofblocks_tag(struct xfs_inode *ip); void xfs_inode_clear_eofblocks_tag(struct xfs_inode *ip); diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 4a3b122d62b5..bb153a91ef70 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -180,6 +180,7 @@ DEFINE_FS_EVENT(xfs_fs_sync_fs); DEFINE_FS_EVENT(xfs_blockgc_start); DEFINE_FS_EVENT(xfs_blockgc_stop); DEFINE_FS_EVENT(xfs_blockgc_worker); +DEFINE_FS_EVENT(xfs_blockgc_flush_all);
DECLARE_EVENT_CLASS(xfs_ag_class, TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno), diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index b2d74a22b231..ac2a1eb74a4f 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -300,10 +300,7 @@ xfs_trans_alloc( * Do not perform a synchronous scan because callers can hold * other locks. */ - error = xfs_blockgc_free_space(mp, NULL); - if (error) - return error; - + xfs_blockgc_flush_all(mp); want_retry = false; goto retry; }
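Putting the xfs_trans.c hunk in context: the reservation path now retries exactly once after kicking every gc worker. A self-contained sketch of that retry shape (the reservation stub and flush helper are stand-ins for the real reservation call and xfs_blockgc_flush_all()):

#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

static int attempts;

/* Stand-in reservation: fails with ENOSPC until the gc flush has run. */
static int reserve_blocks(void)
{
	return ++attempts < 2 ? -ENOSPC : 0;
}

static void blockgc_flush_all(void)
{
	puts("rescheduling blockgc/inodegc workers and waiting for them");
}

static int trans_alloc(void)
{
	bool want_retry = true;
	int error;

retry:
	error = reserve_blocks();
	if (error == -ENOSPC && want_retry) {
		/* Kick every worker, flush, then retry exactly once. */
		blockgc_flush_all();
		want_retry = false;
		goto retry;
	}
	return error;
}

int main(void)
{
	printf("trans_alloc() -> %d after %d attempts\n",
	       trans_alloc(), attempts);
	return 0;
}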
From: "Darrick J. Wong" djwong@kernel.org
mainline-inclusion from mainline-v5.14-rc4 commit a6343e4d9278b3919c809fab9945c4d8f04fadf5 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4KIAO CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
-------------------------------------------------
When we're servicing an INUMBERS or BULKSTAT request or running quotacheck, grab an empty transaction so that we can use its inherent recursive buffer locking abilities to detect inode btree cycles without hitting ABBA buffer deadlocks. This patch requires the deferred inode inactivation patchset because xfs_irele cannot directly call xfs_inactive when the iwalk itself has an (empty) transaction.
Found by fuzzing an inode btree pointer to introduce a cycle into the tree (xfs/365).
Signed-off-by: Darrick J. Wong djwong@kernel.org Reviewed-by: Dave Chinner dchinner@redhat.com Reviewed-by: Christoph Hellwig hch@lst.de Signed-off-by: Lihong Kou koulihong@huawei.com Reviewed-by: Zhang Yi yi.zhang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- fs/xfs/xfs_itable.c | 40 ++++++++++++++++++++++++++++++++++++---- fs/xfs/xfs_iwalk.c | 33 ++++++++++++++++++++++++++++----- 2 files changed, 64 insertions(+), 9 deletions(-)
diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c index 16ca97a7ff00..3a1e45b64bfa 100644 --- a/fs/xfs/xfs_itable.c +++ b/fs/xfs/xfs_itable.c @@ -19,6 +19,7 @@ #include "xfs_error.h" #include "xfs_icache.h" #include "xfs_health.h" +#include "xfs_trans.h"
/* * Bulk Stat @@ -164,6 +165,7 @@ xfs_bulkstat_one( .formatter = formatter, .breq = breq, }; + struct xfs_trans *tp; int error;
ASSERT(breq->icount == 1); @@ -173,8 +175,17 @@ xfs_bulkstat_one( if (!bc.buf) return -ENOMEM;
- error = xfs_bulkstat_one_int(breq->mp, NULL, breq->startino, &bc); + /* + * Grab an empty transaction so that we can use its recursive buffer + * locking abilities to detect cycles in the inobt without deadlocking. + */ + error = xfs_trans_alloc_empty(breq->mp, &tp); + if (error) + goto out;
+ error = xfs_bulkstat_one_int(breq->mp, tp, breq->startino, &bc); + xfs_trans_cancel(tp); +out: kmem_free(bc.buf);
/* @@ -237,6 +248,7 @@ xfs_bulkstat( .formatter = formatter, .breq = breq, }; + struct xfs_trans *tp; int error;
if (xfs_bulkstat_already_done(breq->mp, breq->startino)) @@ -247,9 +259,18 @@ xfs_bulkstat( if (!bc.buf) return -ENOMEM;
- error = xfs_iwalk(breq->mp, NULL, breq->startino, breq->flags, - xfs_bulkstat_iwalk, breq->icount, &bc); + /* + * Grab an empty transaction so that we can use its recursive buffer + * locking abilities to detect cycles in the inobt without deadlocking. + */ + error = xfs_trans_alloc_empty(breq->mp, &tp); + if (error) + goto out;
+ error = xfs_iwalk(breq->mp, tp, breq->startino, breq->flags, + xfs_bulkstat_iwalk, breq->icount, &bc); + xfs_trans_cancel(tp); +out: kmem_free(bc.buf);
/* @@ -362,13 +383,24 @@ xfs_inumbers( .formatter = formatter, .breq = breq, }; + struct xfs_trans *tp; int error = 0;
if (xfs_bulkstat_already_done(breq->mp, breq->startino)) return 0;
- error = xfs_inobt_walk(breq->mp, NULL, breq->startino, breq->flags, + /* + * Grab an empty transaction so that we can use its recursive buffer + * locking abilities to detect cycles in the inobt without deadlocking. + */ + error = xfs_trans_alloc_empty(breq->mp, &tp); + if (error) + goto out; + + error = xfs_inobt_walk(breq->mp, tp, breq->startino, breq->flags, xfs_inumbers_walk, breq->icount, &ic); + xfs_trans_cancel(tp); +out:
/* * We found some inode groups, so clear the error status and return diff --git a/fs/xfs/xfs_iwalk.c b/fs/xfs/xfs_iwalk.c index 708874851dbd..c06702c7e64b 100644 --- a/fs/xfs/xfs_iwalk.c +++ b/fs/xfs/xfs_iwalk.c @@ -81,6 +81,9 @@ struct xfs_iwalk_ag {
/* Skip empty inobt records? */ unsigned int skip_empty:1; + + /* Drop the (hopefully empty) transaction when calling iwalk_fn. */ + unsigned int drop_trans:1; };
/* @@ -351,7 +354,6 @@ xfs_iwalk_run_callbacks( int *has_more) { struct xfs_mount *mp = iwag->mp; - struct xfs_trans *tp = iwag->tp; struct xfs_inobt_rec_incore *irec; xfs_agino_t next_agino; int error; @@ -361,10 +363,15 @@ xfs_iwalk_run_callbacks( ASSERT(iwag->nr_recs > 0);
/* Delete cursor but remember the last record we cached... */ - xfs_iwalk_del_inobt(tp, curpp, agi_bpp, 0); + xfs_iwalk_del_inobt(iwag->tp, curpp, agi_bpp, 0); irec = &iwag->recs[iwag->nr_recs - 1]; ASSERT(next_agino == irec->ir_startino + XFS_INODES_PER_CHUNK);
+ if (iwag->drop_trans) { + xfs_trans_cancel(iwag->tp); + iwag->tp = NULL; + } + error = xfs_iwalk_ag_recs(iwag); if (error) return error; @@ -375,8 +382,15 @@ xfs_iwalk_run_callbacks( if (!has_more) return 0;
+ if (iwag->drop_trans) { + error = xfs_trans_alloc_empty(mp, &iwag->tp); + if (error) + return error; + } + /* ...and recreate the cursor just past where we left off. */ - error = xfs_inobt_cur(mp, tp, agno, XFS_BTNUM_INO, curpp, agi_bpp); + error = xfs_inobt_cur(mp, iwag->tp, agno, XFS_BTNUM_INO, curpp, + agi_bpp); if (error) return error;
@@ -389,7 +403,6 @@ xfs_iwalk_ag( struct xfs_iwalk_ag *iwag) { struct xfs_mount *mp = iwag->mp; - struct xfs_trans *tp = iwag->tp; struct xfs_buf *agi_bp = NULL; struct xfs_btree_cur *cur = NULL; xfs_agnumber_t agno; @@ -469,7 +482,7 @@ xfs_iwalk_ag( error = xfs_iwalk_run_callbacks(iwag, agno, &cur, &agi_bp, &has_more);
out: - xfs_iwalk_del_inobt(tp, &cur, &agi_bp, error); + xfs_iwalk_del_inobt(iwag->tp, &cur, &agi_bp, error); return error; }
@@ -594,8 +607,18 @@ xfs_iwalk_ag_work( error = xfs_iwalk_alloc(iwag); if (error) goto out; + /* + * Grab an empty transaction so that we can use its recursive buffer + * locking abilities to detect cycles in the inobt without deadlocking. + */ + error = xfs_trans_alloc_empty(mp, &iwag->tp); + if (error) + goto out; + iwag->drop_trans = 1;
error = xfs_iwalk_ag(iwag); + if (iwag->tp) + xfs_trans_cancel(iwag->tp); xfs_iwalk_free(iwag); out: kmem_free(iwag);
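The empty transaction helps because, as the commit message says, buffer locks taken within a transaction are recursive for that transaction: a cycle in a fuzzed inobt leads the walk back to a buffer it already holds, which succeeds and can be noticed, instead of into a self-deadlock. A loose userspace analogy with a recursive mutex, modelling only the "same owner may re-acquire" property and nothing else:

#include <pthread.h>
#include <stdio.h>

int main(void)
{
	pthread_mutex_t buf_lock;
	pthread_mutexattr_t attr;

	pthread_mutexattr_init(&attr);
	pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_RECURSIVE);
	pthread_mutex_init(&buf_lock, &attr);

	/* First visit to the btree block. */
	pthread_mutex_lock(&buf_lock);

	/*
	 * A cycle in the tree leads the walk back to the same block. With a
	 * plain mutex this second lock would self-deadlock; the recursive
	 * variant succeeds, so the walker can notice it has been here before.
	 */
	pthread_mutex_lock(&buf_lock);
	puts("re-acquired the same buffer lock without deadlocking");

	pthread_mutex_unlock(&buf_lock);
	pthread_mutex_unlock(&buf_lock);
	return 0;
}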
From: "Darrick J. Wong" djwong@kernel.org
mainline-inclusion from mainline-v5.14-rc4 commit 40b1de007aca4f9ec4ee4322c29f026ebb60ac96 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4KIAO CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
-------------------------------------------------
Now that we defer inode inactivation, we've decoupled the process of unlinking or closing an inode from the process of inactivating it. In theory this should lead to better throughput since we now inactivate the queued inodes in batches instead of one at a time.
Unfortunately, one of the primary risks with this decoupling is the loss of rate control feedback between the frontend and background threads. In other words, a rm -rf /* thread can run the system out of memory if it can queue inodes for inactivation and jump to a new CPU faster than the background threads can actually clear the deferred work. The workers can get scheduled off the CPU if they have to do IO, etc.
To solve this problem, we configure a shrinker so that it will activate the /second/ time the shrinkers are called. The custom shrinker will queue all percpu deferred inactivation workers immediately and set a flag to force frontend callers who are releasing a vfs inode to wait for the inactivation workers.
On my test VM with 560M of RAM and a 2TB filesystem, this seems to solve most of the OOMing problem when deleting 10 million inodes.
Signed-off-by: Darrick J. Wong djwong@kernel.org Reviewed-by: Dave Chinner dchinner@redhat.com Signed-off-by: Lihong Kou koulihong@huawei.com Reviewed-by: Zhang Yi yi.zhang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- fs/xfs/xfs_icache.c | 102 ++++++++++++++++++++++++++++++++++++++++++-- fs/xfs/xfs_icache.h | 1 + fs/xfs/xfs_mount.c | 9 +++- fs/xfs/xfs_mount.h | 3 ++ fs/xfs/xfs_trace.h | 37 +++++++++++++++- 5 files changed, 147 insertions(+), 5 deletions(-)
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index 17dc512b98df..986d087df226 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -1901,8 +1901,9 @@ xfs_inodegc_worker( return;
ip = llist_entry(node, struct xfs_inode, i_gclist); - trace_xfs_inodegc_worker(ip->i_mount, __return_address); + trace_xfs_inodegc_worker(ip->i_mount, READ_ONCE(gc->shrinker_hits));
+ WRITE_ONCE(gc->shrinker_hits, 0); llist_for_each_entry_safe(ip, n, node, i_gclist) { xfs_iflags_set(ip, XFS_INACTIVATING); xfs_inodegc_inactivate(ip); @@ -2036,6 +2037,7 @@ xfs_inodegc_want_queue_work( /* * Make the frontend wait for inactivations when: * + * - Memory shrinkers queued the inactivation worker and it hasn't finished. * - The queue depth exceeds the maximum allowable percpu backlog. * * Note: If the current thread is running a transaction, we don't ever want to @@ -2044,11 +2046,15 @@ xfs_inodegc_want_queue_work( static inline bool xfs_inodegc_want_flush_work( struct xfs_inode *ip, - unsigned int items) + unsigned int items, + unsigned int shrinker_hits) { if (current->journal_info) return false;
+ if (shrinker_hits > 0) + return true; + if (items > XFS_INODEGC_MAX_BACKLOG) return true;
@@ -2067,6 +2073,7 @@ xfs_inodegc_queue( struct xfs_mount *mp = ip->i_mount; struct xfs_inodegc *gc; int items; + unsigned int shrinker_hits;
trace_xfs_inode_set_need_inactive(ip); spin_lock(&ip->i_flags_lock); @@ -2077,6 +2084,7 @@ xfs_inodegc_queue( llist_add(&ip->i_gclist, &gc->list); items = READ_ONCE(gc->items); WRITE_ONCE(gc->items, items + 1); + shrinker_hits = READ_ONCE(gc->shrinker_hits); put_cpu_ptr(gc);
if (!xfs_is_inodegc_enabled(mp)) @@ -2087,7 +2095,7 @@ xfs_inodegc_queue( queue_work(mp->m_inodegc_wq, &gc->work); }
- if (xfs_inodegc_want_flush_work(ip, items)) { + if (xfs_inodegc_want_flush_work(ip, items, shrinker_hits)) { trace_xfs_inodegc_throttle(mp, __return_address); flush_work(&gc->work); } @@ -2167,3 +2175,91 @@ xfs_inode_mark_reclaimable( xfs_qm_dqdetach(ip); xfs_inodegc_set_reclaimable(ip); } + +/* + * Register a phony shrinker so that we can run background inodegc sooner when + * there's memory pressure. Inactivation does not itself free any memory but + * it does make inodes reclaimable, which eventually frees memory. + * + * The count function, seek value, and batch value are crafted to trigger the + * scan function during the second round of scanning. Hopefully this means + * that we reclaimed enough memory that initiating metadata transactions won't + * make things worse. + */ +#define XFS_INODEGC_SHRINKER_COUNT (1UL << DEF_PRIORITY) +#define XFS_INODEGC_SHRINKER_BATCH ((XFS_INODEGC_SHRINKER_COUNT / 2) + 1) + +static unsigned long +xfs_inodegc_shrinker_count( + struct shrinker *shrink, + struct shrink_control *sc) +{ + struct xfs_mount *mp = container_of(shrink, struct xfs_mount, + m_inodegc_shrinker); + struct xfs_inodegc *gc; + int cpu; + + if (!xfs_is_inodegc_enabled(mp)) + return 0; + + for_each_online_cpu(cpu) { + gc = per_cpu_ptr(mp->m_inodegc, cpu); + if (!llist_empty(&gc->list)) + return XFS_INODEGC_SHRINKER_COUNT; + } + + return 0; +} + +static unsigned long +xfs_inodegc_shrinker_scan( + struct shrinker *shrink, + struct shrink_control *sc) +{ + struct xfs_mount *mp = container_of(shrink, struct xfs_mount, + m_inodegc_shrinker); + struct xfs_inodegc *gc; + int cpu; + bool no_items = true; + + if (!xfs_is_inodegc_enabled(mp)) + return SHRINK_STOP; + + trace_xfs_inodegc_shrinker_scan(mp, sc, __return_address); + + for_each_online_cpu(cpu) { + gc = per_cpu_ptr(mp->m_inodegc, cpu); + if (!llist_empty(&gc->list)) { + unsigned int h = READ_ONCE(gc->shrinker_hits); + + WRITE_ONCE(gc->shrinker_hits, h + 1); + queue_work_on(cpu, mp->m_inodegc_wq, &gc->work); + no_items = false; + } + } + + /* + * If there are no inodes to inactivate, we don't want the shrinker + * to think there's deferred work to call us back about. + */ + if (no_items) + return LONG_MAX; + + return SHRINK_STOP; +} + +/* Register a shrinker so we can accelerate inodegc and throttle queuing. */ +int +xfs_inodegc_register_shrinker( + struct xfs_mount *mp) +{ + struct shrinker *shrink = &mp->m_inodegc_shrinker; + + shrink->count_objects = xfs_inodegc_shrinker_count; + shrink->scan_objects = xfs_inodegc_shrinker_scan; + shrink->seeks = 0; + shrink->flags = SHRINKER_NONSLAB; + shrink->batch = XFS_INODEGC_SHRINKER_BATCH; + + return register_shrinker(shrink); +} diff --git a/fs/xfs/xfs_icache.h b/fs/xfs/xfs_icache.h index 8198e84747c9..fd4e79722251 100644 --- a/fs/xfs/xfs_icache.h +++ b/fs/xfs/xfs_icache.h @@ -80,5 +80,6 @@ void xfs_inodegc_flush(struct xfs_mount *mp); void xfs_inodegc_stop(struct xfs_mount *mp); void xfs_inodegc_start(struct xfs_mount *mp); void xfs_inodegc_cpu_dead(struct xfs_mount *mp, unsigned int cpu); +int xfs_inodegc_register_shrinker(struct xfs_mount *mp);
#endif diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index 1539d8d49c09..aa85b380568c 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -881,6 +881,10 @@ xfs_mountfs( goto out_free_perag; }
+ error = xfs_inodegc_register_shrinker(mp); + if (error) + goto out_fail_wait; + /* * Log's mount-time initialization. The first part of recovery can place * some items on the AIL, to be handled when recovery is finished or @@ -891,7 +895,7 @@ xfs_mountfs( XFS_FSB_TO_BB(mp, sbp->sb_logblocks)); if (error) { xfs_warn(mp, "log mount failed"); - goto out_fail_wait; + goto out_inodegc_shrinker; }
/* Make sure the summary counts are ok. */ @@ -1086,6 +1090,8 @@ xfs_mountfs( xfs_unmount_flush_inodes(mp); out_log_dealloc: xfs_log_mount_cancel(mp); + out_inodegc_shrinker: + unregister_shrinker(&mp->m_inodegc_shrinker); out_fail_wait: if (mp->m_logdev_targp && mp->m_logdev_targp != mp->m_ddev_targp) xfs_wait_buftarg(mp->m_logdev_targp); @@ -1172,6 +1178,7 @@ xfs_unmountfs( #if defined(DEBUG) xfs_errortag_clearall(mp); #endif + unregister_shrinker(&mp->m_inodegc_shrinker); xfs_free_perag(mp);
xfs_errortag_del(mp); diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index 805c04a4b278..b75f0e7b67ef 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -64,6 +64,7 @@ struct xfs_inodegc {
/* approximate count of inodes in the list */ unsigned int items; + unsigned int shrinker_hits; };
/* @@ -203,6 +204,8 @@ typedef struct xfs_mount { xfs_agnumber_t m_agirotor; /* last ag dir inode alloced */ spinlock_t m_agirotor_lock;/* .. and lock protecting it */
+ /* Memory shrinker to throttle and reprioritize inodegc */ + struct shrinker m_inodegc_shrinker; /* * Workqueue item so that we can coalesce multiple inode flush attempts * into a single flush. diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index bb153a91ef70..5201111d24ff 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -139,6 +139,22 @@ DEFINE_PERAG_REF_EVENT(xfs_perag_put); DEFINE_PERAG_REF_EVENT(xfs_perag_set_inode_tag); DEFINE_PERAG_REF_EVENT(xfs_perag_clear_inode_tag);
+TRACE_EVENT(xfs_inodegc_worker, + TP_PROTO(struct xfs_mount *mp, unsigned int shrinker_hits), + TP_ARGS(mp, shrinker_hits), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(unsigned int, shrinker_hits) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->shrinker_hits = shrinker_hits; + ), + TP_printk("dev %d:%d shrinker_hits %u", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->shrinker_hits) +); + DECLARE_EVENT_CLASS(xfs_fs_class, TP_PROTO(struct xfs_mount *mp, void *caller_ip), TP_ARGS(mp, caller_ip), @@ -173,7 +189,6 @@ DEFINE_EVENT(xfs_fs_class, name, \ DEFINE_FS_EVENT(xfs_inodegc_flush); DEFINE_FS_EVENT(xfs_inodegc_start); DEFINE_FS_EVENT(xfs_inodegc_stop); -DEFINE_FS_EVENT(xfs_inodegc_worker); DEFINE_FS_EVENT(xfs_inodegc_queue); DEFINE_FS_EVENT(xfs_inodegc_throttle); DEFINE_FS_EVENT(xfs_fs_sync_fs); @@ -182,6 +197,26 @@ DEFINE_FS_EVENT(xfs_blockgc_stop); DEFINE_FS_EVENT(xfs_blockgc_worker); DEFINE_FS_EVENT(xfs_blockgc_flush_all);
+TRACE_EVENT(xfs_inodegc_shrinker_scan, + TP_PROTO(struct xfs_mount *mp, struct shrink_control *sc, + void *caller_ip), + TP_ARGS(mp, sc, caller_ip), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(unsigned long, nr_to_scan) + __field(void *, caller_ip) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->nr_to_scan = sc->nr_to_scan; + __entry->caller_ip = caller_ip; + ), + TP_printk("dev %d:%d nr_to_scan %lu caller %pS", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->nr_to_scan, + __entry->caller_ip) +); + DECLARE_EVENT_CLASS(xfs_ag_class, TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno), TP_ARGS(mp, agno),
From: Christoph Hellwig hch@lst.de
mainline-inclusion from mainline-v5.11-rc4 commit f22c7f87777361f94aa17f746fbadfa499248dc8 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4KIAO CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
-------------------------------------------------
Factor out the log syncing logic into two helpers to make the code easier to read and more maintainable.
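For orientation, here is a minimal sketch of how the split is intended to read at the call site, using only the helper names introduced in the diff below (an illustration, not the verbatim patch):

	xfs_ilock(ip, XFS_ILOCK_SHARED);
	seq = xfs_fsync_seq(ip, datasync);	/* 0: no log flush needed */
	if (seq) {
		error = xfs_log_force_seq(ip->i_mount, seq, XFS_LOG_SYNC,
					  log_flushed);
		/* ili_fsync_fields is then cleared under ili_lock */
	}
	xfs_iunlock(ip, XFS_ILOCK_SHARED);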
Signed-off-by: Christoph Hellwig hch@lst.de Reviewed-by: Brian Foster bfoster@redhat.com Reviewed-by: Darrick J. Wong djwong@kernel.org Signed-off-by: Darrick J. Wong djwong@kernel.org Reviewed-by: Dave Chinner dchinner@redhat.com Signed-off-by: Lihong Kou koulihong@huawei.com Reviewed-by: Zhang Yi yi.zhang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- fs/xfs/xfs_file.c | 81 +++++++++++++++++++++++++++++------------------ 1 file changed, 50 insertions(+), 31 deletions(-)
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index c6845b503667..407d884bf7e0 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -118,6 +118,54 @@ xfs_dir_fsync( return xfs_log_force_inode(ip); }
+static xfs_csn_t +xfs_fsync_seq( + struct xfs_inode *ip, + bool datasync) +{ + if (!xfs_ipincount(ip)) + return 0; + if (datasync && !(ip->i_itemp->ili_fsync_fields & ~XFS_ILOG_TIMESTAMP)) + return 0; + return ip->i_itemp->ili_commit_seq; +} + +/* + * All metadata updates are logged, which means that we just have to flush the + * log up to the latest LSN that touched the inode. + * + * If we have concurrent fsync/fdatasync() calls, we need them to all block on + * the log force before we clear the ili_fsync_fields field. This ensures that + * we don't get a racing sync operation that does not wait for the metadata to + * hit the journal before returning. If we race with clearing ili_fsync_fields, + * then all that will happen is the log force will do nothing as the lsn will + * already be on disk. We can't race with setting ili_fsync_fields because that + * is done under XFS_ILOCK_EXCL, and that can't happen because we hold the lock + * shared until after the ili_fsync_fields is cleared. + */ +static int +xfs_fsync_flush_log( + struct xfs_inode *ip, + bool datasync, + int *log_flushed) +{ + int error = 0; + xfs_csn_t seq; + + xfs_ilock(ip, XFS_ILOCK_SHARED); + seq = xfs_fsync_seq(ip, datasync); + if (seq) { + error = xfs_log_force_seq(ip->i_mount, seq, XFS_LOG_SYNC, + log_flushed); + + spin_lock(&ip->i_itemp->ili_lock); + ip->i_itemp->ili_fsync_fields = 0; + spin_unlock(&ip->i_itemp->ili_lock); + } + xfs_iunlock(ip, XFS_ILOCK_SHARED); + return error; +} + STATIC int xfs_file_fsync( struct file *file, @@ -125,13 +173,10 @@ xfs_file_fsync( loff_t end, int datasync) { - struct inode *inode = file->f_mapping->host; - struct xfs_inode *ip = XFS_I(inode); - struct xfs_inode_log_item *iip = ip->i_itemp; + struct xfs_inode *ip = XFS_I(file->f_mapping->host); struct xfs_mount *mp = ip->i_mount; int error = 0; int log_flushed = 0; - xfs_csn_t seq = 0;
trace_xfs_file_fsync(ip);
@@ -155,33 +200,7 @@ xfs_file_fsync( else if (mp->m_logdev_targp != mp->m_ddev_targp) xfs_blkdev_issue_flush(mp->m_ddev_targp);
- /* - * All metadata updates are logged, which means that we just have to - * flush the log up to the latest LSN that touched the inode. If we have - * concurrent fsync/fdatasync() calls, we need them to all block on the - * log force before we clear the ili_fsync_fields field. This ensures - * that we don't get a racing sync operation that does not wait for the - * metadata to hit the journal before returning. If we race with - * clearing the ili_fsync_fields, then all that will happen is the log - * force will do nothing as the seq will already be on disk. We can't - * race with setting ili_fsync_fields because that is done under - * XFS_ILOCK_EXCL, and that can't happen because we hold the lock shared - * until after the ili_fsync_fields is cleared. - */ - xfs_ilock(ip, XFS_ILOCK_SHARED); - if (xfs_ipincount(ip)) { - if (!datasync || - (iip->ili_fsync_fields & ~XFS_ILOG_TIMESTAMP)) - seq = iip->ili_commit_seq; - } - - if (seq) { - error = xfs_log_force_seq(mp, seq, XFS_LOG_SYNC, &log_flushed); - spin_lock(&iip->ili_lock); - iip->ili_fsync_fields = 0; - spin_unlock(&iip->ili_lock); - } - xfs_iunlock(ip, XFS_ILOCK_SHARED); + error = xfs_fsync_flush_log(ip, datasync, &log_flushed);
/* * If we only have a single device, and the log force about was
From: Christoph Hellwig hch@lst.de
mainline-inclusion from mainline-v5.11-rc4 commit ae29e4220fd3047b5442e7e8db8027d7745093f5 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4KIAO CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
-------------------------------------------------
If the inode is not pinned by the time fsync is called we don't need the ilock to protect against concurrent clearing of ili_fsync_fields as the inode won't need a log flush or clearing of these fields. Not taking the iolock allows for full concurrency of fsync and thus O_DSYNC completions with io_uring/aio write submissions.
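A minimal sketch of the resulting fast path, mirroring the hunk below: the unlocked pin-count check lets unpinned inodes skip the ilock cycle entirely:

	/*
	 * An inode with dirty, logged modifications is always pinned, so
	 * an unpinned inode needs neither a log flush nor the ilock to
	 * protect ili_fsync_fields.
	 */
	if (xfs_ipincount(ip))
		error = xfs_fsync_flush_log(ip, datasync, &log_flushed);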
Signed-off-by: Christoph Hellwig hch@lst.de Reviewed-by: Darrick J. Wong djwong@kernel.org Signed-off-by: Darrick J. Wong djwong@kernel.org Reviewed-by: Dave Chinner dchinner@redhat.com Signed-off-by: Lihong Kou koulihong@huawei.com Reviewed-by: Zhang Yi yi.zhang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- fs/xfs/xfs_file.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-)
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 407d884bf7e0..2fb8cf556f8d 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -200,7 +200,14 @@ xfs_file_fsync( else if (mp->m_logdev_targp != mp->m_ddev_targp) xfs_blkdev_issue_flush(mp->m_ddev_targp);
- error = xfs_fsync_flush_log(ip, datasync, &log_flushed); + /* + * Any inode that has dirty modifications in the log is pinned. The + * racy check here for a pinned inode will not catch modifications + * that happen concurrently to the fsync call, but fsync semantics + * only require us to sync previously completed I/O. + */ + if (xfs_ipincount(ip)) + error = xfs_fsync_flush_log(ip, datasync, &log_flushed);
/* * If we only have a single device, and the log force about was
From: Dave Chinner dchinner@redhat.com
mainline-inclusion from mainline-v5.14-rc4 commit 0020a190cf3eac16995143db41b21b82bacdcbe3 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4KIAO CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
-------------------------------------------------
The AIL pushing is stalling on log forces when it comes across pinned items. This is happening on removal workloads where the AIL is dominated by stale items that are removed from the AIL when the checkpoint that marks the items stale is committed to the journal. This results in relatively few items in the AIL, but those that remain are often pinned, as the directories the items are being removed from are still being logged.
As a result, many push cycles through the CIL will first issue a blocking log force to unpin the items. This can take some time to complete, with tracing regularly showing push delays of half a second and sometimes up into the range of several seconds. Sequences like this aren't uncommon:
....
399.829437: xfsaild: last lsn 0x11002dd000 count 101 stuck 101 flushing 0 tout 20
            <wanted 20ms, got 270ms delay>
400.099622: xfsaild: target 0x11002f3600, prev 0x11002f3600, last lsn 0x0
400.099623: xfsaild: first lsn 0x11002f3600
400.099679: xfsaild: last lsn 0x1100305000 count 16 stuck 11 flushing 0 tout 50
            <wanted 50ms, got 500ms delay>
400.589348: xfsaild: target 0x110032e600, prev 0x11002f3600, last lsn 0x0
400.589349: xfsaild: first lsn 0x1100305000
400.589595: xfsaild: last lsn 0x110032e600 count 156 stuck 101 flushing 30 tout 50
            <wanted 50ms, got 460ms delay>
400.950341: xfsaild: target 0x1100353000, prev 0x110032e600, last lsn 0x0
400.950343: xfsaild: first lsn 0x1100317c00
400.950436: xfsaild: last lsn 0x110033d200 count 105 stuck 101 flushing 0 tout 20
            <wanted 20ms, got 200ms delay>
401.142333: xfsaild: target 0x1100361600, prev 0x1100353000, last lsn 0x0
401.142334: xfsaild: first lsn 0x110032e600
401.142535: xfsaild: last lsn 0x1100353000 count 122 stuck 101 flushing 8 tout 10
            <wanted 10ms, got 10ms delay>
401.154323: xfsaild: target 0x1100361600, prev 0x1100361600, last lsn 0x1100353000
401.154328: xfsaild: first lsn 0x1100353000
401.154389: xfsaild: last lsn 0x1100353000 count 101 stuck 101 flushing 0 tout 20
            <wanted 20ms, got 300ms delay>
401.451525: xfsaild: target 0x1100361600, prev 0x1100361600, last lsn 0x0
401.451526: xfsaild: first lsn 0x1100353000
401.451804: xfsaild: last lsn 0x1100377200 count 170 stuck 22 flushing 122 tout 50
            <wanted 50ms, got 500ms delay>
401.933581: xfsaild: target 0x1100361600, prev 0x1100361600, last lsn 0x0
....
In each of these cases, every AIL pass saw 101 log items stuck on the AIL (pinned) with very few other items being found. Each pass, a log force was issued, and delay between last/first is the sleep time + the sync log force time.
Some of these 101 items pinned the tail of the log. The tail of the log does slowly creep forward (first lsn), but the problem is that the log is actually out of reservation space because it has been running so many transactions that generate stale items that never reach the AIL but still consume log space. Hence we have a largely empty AIL, with long term pins on items that pin the tail of the log and don't get pushed frequently enough to keep log space available.
The problem is the hundreds of milliseconds that we block in the log force pushing the CIL out to disk. The AIL should not be stalled like this - it needs to run and flush items that are at the tail of the log with minimal latency. What we really need to do is trigger a log flush, but then not wait for it at all - we've already done our waiting for stuff to complete when we backed off prior to the log force being issued.
Even if we remove the XFS_LOG_SYNC from the xfs_log_force() call, we still do a blocking flush of the CIL and that is what is causing the issue. Hence we need a new interface for the CIL to trigger an immediate background push of the CIL to get it moving faster but not to wait on that to occur. While the CIL is pushing, the AIL can also be pushing.
We already have an internal interface to do this - xlog_cil_push_now() - but we need a wrapper for it to be used externally. xlog_cil_force_seq() can easily be extended to do what we need as it already implements the synchronous CIL push via xlog_cil_push_now(). Add the necessary flags and "push current sequence" semantics to xlog_cil_force_seq() and convert the AIL pushing to use it.
One of the complexities here is that the CIL push does not guarantee that the commit record for the CIL checkpoint is written to disk. The current log force ensures this by submitting the current ACTIVE iclog that the commit record was written to. We need the CIL to actually write this commit record to disk for an async push to ensure that the checkpoint actually makes it to disk and unpins the pinned items in the checkpoint on completion. Hence we need to pass down to the CIL push that we are doing an async flush so that it can switch out the commit_iclog if necessary to get written to disk when the commit iclog is finally released.
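The net effect at the AIL push site (see the xfs_trans_ail.c hunk below) is a one-line conversion from a synchronous log force to the new non-blocking CIL flush; a before/after sketch:

	/* Before: blocks until the CIL checkpoint reaches stable storage */
	xfs_log_force(mp, XFS_LOG_SYNC);

	/* After: queue a CIL push that also writes the commit record to
	 * disk, then keep pushing the AIL without waiting */
	xlog_cil_flush(mp->m_log);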
Signed-off-by: Dave Chinner dchinner@redhat.com Reviewed-by: Darrick J. Wong djwong@kernel.org Reviewed-by: Allison Henderson allison.henderson@oracle.com Signed-off-by: Darrick J. Wong djwong@kernel.org Signed-off-by: Lihong Kou koulihong@huawei.com Reviewed-by: Zhang Yi yi.zhang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- fs/xfs/xfs_log.c | 38 +++++++++++++----------- fs/xfs/xfs_log.h | 1 + fs/xfs/xfs_log_cil.c | 65 ++++++++++++++++++++++++++++++++++++------ fs/xfs/xfs_log_priv.h | 4 +++ fs/xfs/xfs_sysfs.c | 1 + fs/xfs/xfs_trace.c | 1 + fs/xfs/xfs_trans.c | 2 +- fs/xfs/xfs_trans_ail.c | 11 +++++-- 8 files changed, 94 insertions(+), 29 deletions(-)
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 205bce9cf6f9..b9f2ad4e8345 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -50,11 +50,6 @@ xlog_state_get_iclog_space( int *continued_write, int *logoffsetp); STATIC void -xlog_state_switch_iclogs( - struct xlog *log, - struct xlog_in_core *iclog, - int eventual_size); -STATIC void xlog_grant_push_ail( struct xlog *log, int need_bytes); @@ -3020,7 +3015,7 @@ xfs_log_ticket_ungrant( * This routine will mark the current iclog in the ring as WANT_SYNC and move * the current iclog pointer to the next iclog in the ring. */ -STATIC void +void xlog_state_switch_iclogs( struct xlog *log, struct xlog_in_core *iclog, @@ -3192,6 +3187,20 @@ xfs_log_force( return -EIO; }
+/* + * Force the log to a specific LSN. + * + * If an iclog with that lsn can be found: + * If it is in the DIRTY state, just return. + * If it is in the ACTIVE state, move the in-core log into the WANT_SYNC + * state and go to sleep or return. + * If it is in any other state, go to sleep or return. + * + * Synchronous forces are implemented with a wait queue. All callers trying + * to force a given lsn to disk must wait on the queue attached to the + * specific in-core log. When given in-core log finally completes its write + * to disk, that thread will wake up all threads waiting on the queue. + */ static int xlog_force_lsn( struct xlog *log, @@ -3277,18 +3286,13 @@ xlog_force_lsn( }
/* - * Force the in-core log to disk for a specific LSN. + * Force the log to a specific checkpoint sequence. * - * Find in-core log with lsn. - * If it is in the DIRTY state, just return. - * If it is in the ACTIVE state, move the in-core log into the WANT_SYNC - * state and go to sleep or return. - * If it is in any other state, go to sleep or return. - * - * Synchronous forces are implemented with a wait queue. All callers trying - * to force a given lsn to disk must wait on the queue attached to the - * specific in-core log. When given in-core log finally completes its write - * to disk, that thread will wake up all threads waiting on the queue. + * First force the CIL so that all the required changes have been flushed to the + * iclogs. If the CIL force completed it will return a commit LSN that indicates + * the iclog that needs to be flushed to stable storage. If the caller needs + * a synchronous log force, we will wait on the iclog with the LSN returned by + * xlog_cil_force_seq() to be completed. */ int xfs_log_force_seq( diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h index 93fbb8fbbc8c..66bd673344a8 100644 --- a/fs/xfs/xfs_log.h +++ b/fs/xfs/xfs_log.h @@ -104,6 +104,7 @@ struct xlog_ticket; struct xfs_log_item; struct xfs_item_ops; struct xfs_trans; +struct xlog;
int xfs_log_force(struct xfs_mount *mp, uint flags); int xfs_log_force_seq(struct xfs_mount *mp, xfs_csn_t seq, uint flags, diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c index 4e41130f206f..f31e72f5cd2c 100644 --- a/fs/xfs/xfs_log_cil.c +++ b/fs/xfs/xfs_log_cil.c @@ -667,6 +667,7 @@ xlog_cil_push_work( xfs_csn_t push_seq; struct bio bio; DECLARE_COMPLETION_ONSTACK(bdev_flush); + bool push_commit_stable;
new_ctx = kmem_zalloc(sizeof(*new_ctx), KM_NOFS); new_ctx->ticket = xlog_cil_ticket_alloc(log); @@ -677,6 +678,8 @@ xlog_cil_push_work( spin_lock(&cil->xc_push_lock); push_seq = cil->xc_push_seq; ASSERT(push_seq <= ctx->sequence); + push_commit_stable = cil->xc_push_commit_stable; + cil->xc_push_commit_stable = false;
/* * As we are about to switch to a new, empty CIL context, we no longer @@ -956,9 +959,20 @@ xlog_cil_push_work( * The commit iclog must be written to stable storage to guarantee * journal IO vs metadata writeback IO is correctly ordered on stable * storage. + * + * If the push caller needs the commit to be immediately stable and the + * commit_iclog is not yet marked as XLOG_STATE_WANT_SYNC to indicate it + * will be written when released, switch its state to WANT_SYNC right + * now. */ commit_iclog->ic_flags |= XLOG_ICL_NEED_FUA; + if (push_commit_stable && + commit_iclog->ic_state == XLOG_STATE_ACTIVE) + xlog_state_switch_iclogs(log, commit_iclog, 0); xlog_state_release_iclog(log, commit_iclog, preflush_tail_lsn); + + /* Not safe to reference ctx now! */ + spin_unlock(&log->l_icloglock); return;
@@ -1042,13 +1056,26 @@ xlog_cil_push_background( /* * xlog_cil_push_now() is used to trigger an immediate CIL push to the sequence * number that is passed. When it returns, the work will be queued for - * @push_seq, but it won't be completed. The caller is expected to do any - * waiting for push_seq to complete if it is required. + * @push_seq, but it won't be completed. + * + * If the caller is performing a synchronous force, we will flush the workqueue + * to get previously queued work moving to minimise the wait time they will + * undergo waiting for all outstanding pushes to complete. The caller is + * expected to do the required waiting for push_seq to complete. + * + * If the caller is performing an async push, we need to ensure that the + * checkpoint is fully flushed out of the iclogs when we finish the push. If we + * don't do this, then the commit record may remain sitting in memory in an + * ACTIVE iclog. This then requires another full log force to push to disk, + * which defeats the purpose of having an async, non-blocking CIL force + * mechanism. Hence in this case we need to pass a flag to the push work to + * indicate it needs to flush the commit record itself. */ static void xlog_cil_push_now( struct xlog *log, - xfs_lsn_t push_seq) + xfs_lsn_t push_seq, + bool async) { struct xfs_cil *cil = log->l_cilp;
@@ -1058,7 +1085,8 @@ xlog_cil_push_now( ASSERT(push_seq && push_seq <= cil->xc_current_sequence);
/* start on any pending background push to minimise wait time on it */ - flush_work(&cil->xc_push_work); + if (!async) + flush_work(&cil->xc_push_work);
/* * If the CIL is empty or we've already pushed the sequence then @@ -1071,6 +1099,7 @@ xlog_cil_push_now( }
cil->xc_push_seq = push_seq; + cil->xc_push_commit_stable = async; queue_work(log->l_mp->m_cil_workqueue, &cil->xc_push_work); spin_unlock(&cil->xc_push_lock); } @@ -1155,12 +1184,27 @@ xlog_cil_commit( xlog_cil_push_background(log); }
+/* + * Flush the CIL to stable storage but don't wait for it to complete. This + * requires the CIL push to ensure the commit record for the push hits the disk, + * but otherwise is no different to a push done from a log force. + */ +void +xlog_cil_flush( + struct xlog *log) +{ + xfs_csn_t seq = log->l_cilp->xc_current_sequence; + + trace_xfs_log_force(log->l_mp, seq, _RET_IP_); + xlog_cil_push_now(log, seq, true); +} + /* * Conditionally push the CIL based on the sequence passed in. * - * We only need to push if we haven't already pushed the sequence - * number given. Hence the only time we will trigger a push here is - * if the push sequence is the same as the current context. + * We only need to push if we haven't already pushed the sequence number given. + * Hence the only time we will trigger a push here is if the push sequence is + * the same as the current context. * * We return the current commit lsn to allow the callers to determine if a * iclog flush is necessary following this call. @@ -1176,13 +1220,17 @@ xlog_cil_force_seq(
ASSERT(sequence <= cil->xc_current_sequence);
+ if (!sequence) + sequence = cil->xc_current_sequence; + trace_xfs_log_force(log->l_mp, sequence, _RET_IP_); + /* * check to see if we need to force out the current context. * xlog_cil_push() handles racing pushes for the same sequence, * so no need to deal with it here. */ restart: - xlog_cil_push_now(log, sequence); + xlog_cil_push_now(log, sequence, false);
/* * See if we can find a previous sequence still committing. @@ -1206,6 +1254,7 @@ xlog_cil_force_seq( * It is still being pushed! Wait for the push to * complete, then start again from the beginning. */ + XFS_STATS_INC(log->l_mp, xs_log_force_sleep); xlog_wait(&cil->xc_commit_wait, &cil->xc_push_lock); goto restart; } diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h index 7cbde0b4f990..90fee866c613 100644 --- a/fs/xfs/xfs_log_priv.h +++ b/fs/xfs/xfs_log_priv.h @@ -280,6 +280,7 @@ struct xfs_cil {
spinlock_t xc_push_lock ____cacheline_aligned_in_smp; xfs_csn_t xc_push_seq; + bool xc_push_commit_stable; struct list_head xc_committing; wait_queue_head_t xc_commit_wait; xfs_csn_t xc_current_sequence; @@ -497,6 +498,8 @@ int xlog_commit_record(struct xlog *log, struct xlog_ticket *ticket, void xfs_log_ticket_ungrant(struct xlog *log, struct xlog_ticket *ticket); void xfs_log_ticket_regrant(struct xlog *log, struct xlog_ticket *ticket);
+void xlog_state_switch_iclogs(struct xlog *log, struct xlog_in_core *iclog, + int eventual_size); int xlog_state_release_iclog(struct xlog *log, struct xlog_in_core *iclog, xfs_lsn_t log_tail_lsn);
@@ -568,6 +571,7 @@ void xlog_cil_commit(struct xlog *log, struct xfs_trans *tp, /* * CIL force routines */ +void xlog_cil_flush(struct xlog *log); xfs_lsn_t xlog_cil_force_seq(struct xlog *log, xfs_csn_t sequence);
static inline void diff --git a/fs/xfs/xfs_sysfs.c b/fs/xfs/xfs_sysfs.c index f1bc88f4367c..18dc5eca6c04 100644 --- a/fs/xfs/xfs_sysfs.c +++ b/fs/xfs/xfs_sysfs.c @@ -10,6 +10,7 @@ #include "xfs_log_format.h" #include "xfs_trans_resv.h" #include "xfs_sysfs.h" +#include "xfs_log.h" #include "xfs_log_priv.h" #include "xfs_mount.h"
diff --git a/fs/xfs/xfs_trace.c b/fs/xfs/xfs_trace.c index 9b8d703dc9fd..d111a994b7b6 100644 --- a/fs/xfs/xfs_trace.c +++ b/fs/xfs/xfs_trace.c @@ -20,6 +20,7 @@ #include "xfs_bmap.h" #include "xfs_attr.h" #include "xfs_trans.h" +#include "xfs_log.h" #include "xfs_log_priv.h" #include "xfs_buf_item.h" #include "xfs_quota.h" diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index ac2a1eb74a4f..3d1c4a4cb16e 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -9,7 +9,6 @@ #include "xfs_shared.h" #include "xfs_format.h" #include "xfs_log_format.h" -#include "xfs_log_priv.h" #include "xfs_trans_resv.h" #include "xfs_mount.h" #include "xfs_extent_busy.h" @@ -17,6 +16,7 @@ #include "xfs_trans.h" #include "xfs_trans_priv.h" #include "xfs_log.h" +#include "xfs_log_priv.h" #include "xfs_trace.h" #include "xfs_error.h" #include "xfs_defer.h" diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c index dbb69b4bf3ed..69aac416e2ce 100644 --- a/fs/xfs/xfs_trans_ail.c +++ b/fs/xfs/xfs_trans_ail.c @@ -17,6 +17,7 @@ #include "xfs_errortag.h" #include "xfs_error.h" #include "xfs_log.h" +#include "xfs_log_priv.h"
#ifdef DEBUG /* @@ -429,8 +430,12 @@ xfsaild_push(
/* * If we encountered pinned items or did not finish writing out all - * buffers the last time we ran, force the log first and wait for it - * before pushing again. + * buffers the last time we ran, force a background CIL push to get the + * items unpinned in the near future. We do not wait on the CIL push as + * that could stall us for seconds if there is enough background IO + * load. Stalling for that long when the tail of the log is pinned and + * needs flushing will hard stop the transaction subsystem when log + * space runs out. */ if (ailp->ail_log_flush && ailp->ail_last_pushed_lsn == 0 && (!list_empty_careful(&ailp->ail_buf_list) || @@ -438,7 +443,7 @@ xfsaild_push( ailp->ail_log_flush = 0;
XFS_STATS_INC(mp, xs_push_ail_flush); - xfs_log_force(mp, XFS_LOG_SYNC); + xlog_cil_flush(mp->m_log); }
spin_lock(&ailp->ail_lock);
From: Dave Chinner dchinner@redhat.com
mainline-inclusion from mainline-v5.14-rc4 commit 39823d0fac9416cb89c252d78e262ee8cd76a7d8 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4KIAO CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
-------------------------------------------------
Because we use a single work structure attached to the CIL rather than the CIL context, we can only queue a single work item at a time. This results in the CIL being single threaded and limits performance when it becomes CPU bound.
The design of the CIL is that it is pipelined and multiple commits can be running concurrently, but the way the work is currently implemented means that it is not pipelining as it was intended. The critical work to switch the CIL context can take a few milliseconds to run, but the rest of the CIL context flush can take hundreds of milliseconds to complete. The context switching is the serialisation point of the CIL; once the context has been switched, the rest of the context push can run asynchronously with all other context pushes.
Hence we can move the work to the CIL context so that we can run multiple CIL pushes at the same time and spread the majority of the work out over multiple CPUs. We can keep the per-cpu CIL commit state on the CIL rather than the context, because the context is pinned to the CIL until the switch is done and we aggregate and drain the per-cpu state held on the CIL during the context switch.
However, because we no longer serialise the CIL work, we can have effectively unlimited CIL pushes in progress. We don't want to do this - not only does it create contention on the iclogs and the state machine locks, we can run the log right out of space with outstanding pushes. Instead, limit the work concurrency to 4 concurrent works being processed at a time. This is enough concurrency to remove the CIL from being a CPU bound bottleneck but not enough to create new contention points or unbound concurrency issues.
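A condensed sketch of the structural change, using names from the diff below: the push work moves from the single CIL to each checkpoint context, and the workqueue depth bounds the pipeline:

	/* Each checkpoint context owns its own push work now, so
	 * multiple checkpoints can be pushed concurrently. */
	ctx = kmem_zalloc(sizeof(*ctx), KM_NOFS);
	INIT_WORK(&ctx->push_work, xlog_cil_push_work);

	/* The workqueue's max_active of 4 bounds the pipeline depth. */
	queue_work(mp->m_cil_workqueue, &cil->xc_ctx->push_work);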
Signed-off-by: Dave Chinner dchinner@redhat.com Reviewed-by: Darrick J. Wong djwong@kernel.org Signed-off-by: Darrick J. Wong djwong@kernel.org Signed-off-by: Lihong Kou koulihong@huawei.com Reviewed-by: Zhang Yi yi.zhang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- fs/xfs/xfs_log_cil.c | 80 +++++++++++++++++++++++-------------------- fs/xfs/xfs_log_priv.h | 2 +- fs/xfs/xfs_super.c | 6 +++- 3 files changed, 48 insertions(+), 40 deletions(-)
diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c index f31e72f5cd2c..a35a25935310 100644 --- a/fs/xfs/xfs_log_cil.c +++ b/fs/xfs/xfs_log_cil.c @@ -47,6 +47,34 @@ xlog_cil_ticket_alloc( return tic; }
+/* + * Unavoidable forward declaration - xlog_cil_push_work() calls + * xlog_cil_ctx_alloc() itself. + */ +static void xlog_cil_push_work(struct work_struct *work); + +static struct xfs_cil_ctx * +xlog_cil_ctx_alloc(void) +{ + struct xfs_cil_ctx *ctx; + + ctx = kmem_zalloc(sizeof(*ctx), KM_NOFS); + INIT_LIST_HEAD(&ctx->committing); + INIT_LIST_HEAD(&ctx->busy_extents); + INIT_WORK(&ctx->push_work, xlog_cil_push_work); + return ctx; +} + +static void +xlog_cil_ctx_switch( + struct xfs_cil *cil, + struct xfs_cil_ctx *ctx) +{ + ctx->sequence = ++cil->xc_current_sequence; + ctx->cil = cil; + cil->xc_ctx = ctx; +} + /* * After the first stage of log recovery is done, we know where the head and * tail of the log are. We need this log initialisation done before we can @@ -649,11 +677,11 @@ static void xlog_cil_push_work( struct work_struct *work) { - struct xfs_cil *cil = - container_of(work, struct xfs_cil, xc_push_work); + struct xfs_cil_ctx *ctx = + container_of(work, struct xfs_cil_ctx, push_work); + struct xfs_cil *cil = ctx->cil; struct xlog *log = cil->xc_log; struct xfs_log_vec *lv; - struct xfs_cil_ctx *ctx; struct xfs_cil_ctx *new_ctx; struct xlog_in_core *commit_iclog; struct xlog_ticket *tic; @@ -669,11 +697,10 @@ xlog_cil_push_work( DECLARE_COMPLETION_ONSTACK(bdev_flush); bool push_commit_stable;
- new_ctx = kmem_zalloc(sizeof(*new_ctx), KM_NOFS); + new_ctx = xlog_cil_ctx_alloc(); new_ctx->ticket = xlog_cil_ticket_alloc(log);
down_write(&cil->xc_ctx_lock); - ctx = cil->xc_ctx;
spin_lock(&cil->xc_push_lock); push_seq = cil->xc_push_seq; @@ -705,7 +732,7 @@ xlog_cil_push_work(
/* check for a previously pushed sequence */ - if (push_seq < cil->xc_ctx->sequence) { + if (push_seq < ctx->sequence) { spin_unlock(&cil->xc_push_lock); goto out_skip; } @@ -778,19 +805,7 @@ xlog_cil_push_work( }
/* - * initialise the new context and attach it to the CIL. Then attach - * the current context to the CIL committing list so it can be found - * during log forces to extract the commit lsn of the sequence that - * needs to be forced. - */ - INIT_LIST_HEAD(&new_ctx->committing); - INIT_LIST_HEAD(&new_ctx->busy_extents); - new_ctx->sequence = ctx->sequence + 1; - new_ctx->cil = cil; - cil->xc_ctx = new_ctx; - - /* - * The switch is now done, so we can drop the context lock and move out + * Switch the contexts so we can drop the context lock and move out * of a shared context. We can't just go straight to the commit record, * though - we need to synchronise with previous and future commits so * that the commit records are correctly ordered in the log to ensure @@ -815,7 +830,7 @@ xlog_cil_push_work( * deferencing a freed context pointer. */ spin_lock(&cil->xc_push_lock); - cil->xc_current_sequence = new_ctx->sequence; + xlog_cil_ctx_switch(cil, new_ctx); spin_unlock(&cil->xc_push_lock); up_write(&cil->xc_ctx_lock);
@@ -1020,7 +1035,7 @@ xlog_cil_push_background( spin_lock(&cil->xc_push_lock); if (cil->xc_push_seq < cil->xc_current_sequence) { cil->xc_push_seq = cil->xc_current_sequence; - queue_work(log->l_mp->m_cil_workqueue, &cil->xc_push_work); + queue_work(log->l_mp->m_cil_workqueue, &cil->xc_ctx->push_work); }
/* @@ -1086,7 +1101,7 @@ xlog_cil_push_now(
/* start on any pending background push to minimise wait time on it */ if (!async) - flush_work(&cil->xc_push_work); + flush_workqueue(log->l_mp->m_cil_workqueue);
/* * If the CIL is empty or we've already pushed the sequence then @@ -1100,7 +1115,7 @@ xlog_cil_push_now(
cil->xc_push_seq = push_seq; cil->xc_push_commit_stable = async; - queue_work(log->l_mp->m_cil_workqueue, &cil->xc_push_work); + queue_work(log->l_mp->m_cil_workqueue, &cil->xc_ctx->push_work); spin_unlock(&cil->xc_push_lock); }
@@ -1340,13 +1355,6 @@ xlog_cil_init( if (!cil) return -ENOMEM;
- ctx = kmem_zalloc(sizeof(*ctx), KM_MAYFAIL); - if (!ctx) { - kmem_free(cil); - return -ENOMEM; - } - - INIT_WORK(&cil->xc_push_work, xlog_cil_push_work); INIT_LIST_HEAD(&cil->xc_cil); INIT_LIST_HEAD(&cil->xc_committing); spin_lock_init(&cil->xc_cil_lock); @@ -1354,16 +1362,12 @@ xlog_cil_init( init_waitqueue_head(&cil->xc_push_wait); init_rwsem(&cil->xc_ctx_lock); init_waitqueue_head(&cil->xc_commit_wait); - - INIT_LIST_HEAD(&ctx->committing); - INIT_LIST_HEAD(&ctx->busy_extents); - ctx->sequence = 1; - ctx->cil = cil; - cil->xc_ctx = ctx; - cil->xc_current_sequence = ctx->sequence; - cil->xc_log = log; log->l_cilp = cil; + + ctx = xlog_cil_ctx_alloc(); + xlog_cil_ctx_switch(cil, ctx); + return 0; }
diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h index 90fee866c613..587207d2d7f5 100644 --- a/fs/xfs/xfs_log_priv.h +++ b/fs/xfs/xfs_log_priv.h @@ -252,6 +252,7 @@ struct xfs_cil_ctx { struct list_head iclog_entry; struct list_head committing; /* ctx committing list */ struct work_struct discard_endio_work; + struct work_struct push_work; };
/* @@ -284,7 +285,6 @@ struct xfs_cil { struct list_head xc_committing; wait_queue_head_t xc_commit_wait; xfs_csn_t xc_current_sequence; - struct work_struct xc_push_work; wait_queue_head_t xc_push_wait; /* background push throttle */ } ____cacheline_aligned_in_smp;
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 0c59b538bab5..b6b41f0d3c71 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -524,9 +524,13 @@ xfs_init_mount_workqueues( if (!mp->m_unwritten_workqueue) goto out_destroy_buf;
+ /* + * Limit the CIL pipeline depth to 4 concurrent works to bound the + * concurrency the log spinlocks will be exposed to. + */ mp->m_cil_workqueue = alloc_workqueue("xfs-cil/%s", XFS_WQFLAGS(WQ_FREEZABLE | WQ_MEM_RECLAIM | WQ_UNBOUND), - 0, mp->m_super->s_id); + 4, mp->m_super->s_id); if (!mp->m_cil_workqueue) goto out_destroy_unwritten;
From: Dave Chinner dchinner@redhat.com
mainline-inclusion from mainline-v5.14-rc4 commit 33c0dd7898a11ef19169abe5c5049fa6aa099c64 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4KIAO CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
-------------------------------------------------
We only use the CIL workqueue in the CIL, so it makes no sense to hang it off the xfs_mount and have to walk multiple pointers back up to the mount when we have the CIL structures right there.
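The effect at the queueing sites is just a shorter pointer chain; both forms appear in the diff below:

	/* Before: walk from the log back up to the mount */
	queue_work(log->l_mp->m_cil_workqueue, &cil->xc_ctx->push_work);

	/* After: the workqueue hangs off the CIL itself */
	queue_work(cil->xc_push_wq, &cil->xc_ctx->push_work);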
Signed-off-by: Dave Chinner dchinner@redhat.com Reviewed-by: Darrick J. Wong djwong@kernel.org Signed-off-by: Darrick J. Wong djwong@kernel.org Signed-off-by: Lihong Kou koulihong@huawei.com Reviewed-by: Zhang Yi yi.zhang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- fs/xfs/xfs_log_cil.c | 20 +++++++++++++++++--- fs/xfs/xfs_log_priv.h | 1 + fs/xfs/xfs_mount.h | 1 - fs/xfs/xfs_super.c | 15 +-------------- 4 files changed, 19 insertions(+), 18 deletions(-)
diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c index a35a25935310..d17e8b7e7fc6 100644 --- a/fs/xfs/xfs_log_cil.c +++ b/fs/xfs/xfs_log_cil.c @@ -1035,7 +1035,7 @@ xlog_cil_push_background( spin_lock(&cil->xc_push_lock); if (cil->xc_push_seq < cil->xc_current_sequence) { cil->xc_push_seq = cil->xc_current_sequence; - queue_work(log->l_mp->m_cil_workqueue, &cil->xc_ctx->push_work); + queue_work(cil->xc_push_wq, &cil->xc_ctx->push_work); }
/* @@ -1101,7 +1101,7 @@ xlog_cil_push_now(
/* start on any pending background push to minimise wait time on it */ if (!async) - flush_workqueue(log->l_mp->m_cil_workqueue); + flush_workqueue(cil->xc_push_wq);
/* * If the CIL is empty or we've already pushed the sequence then @@ -1115,7 +1115,7 @@ xlog_cil_push_now(
cil->xc_push_seq = push_seq; cil->xc_push_commit_stable = async; - queue_work(log->l_mp->m_cil_workqueue, &cil->xc_ctx->push_work); + queue_work(cil->xc_push_wq, &cil->xc_ctx->push_work); spin_unlock(&cil->xc_push_lock); }
@@ -1354,6 +1354,15 @@ xlog_cil_init( cil = kmem_zalloc(sizeof(*cil), KM_MAYFAIL); if (!cil) return -ENOMEM; + /* + * Limit the CIL pipeline depth to 4 concurrent works to bound the + * concurrency the log spinlocks will be exposed to. + */ + cil->xc_push_wq = alloc_workqueue("xfs-cil/%s", + XFS_WQFLAGS(WQ_FREEZABLE | WQ_MEM_RECLAIM | WQ_UNBOUND), + 4, log->l_mp->m_super->s_id); + if (!cil->xc_push_wq) + goto out_destroy_cil;
INIT_LIST_HEAD(&cil->xc_cil); INIT_LIST_HEAD(&cil->xc_committing); @@ -1369,6 +1378,10 @@ xlog_cil_init( xlog_cil_ctx_switch(cil, ctx);
return 0; + +out_destroy_cil: + kmem_free(cil); + return -ENOMEM; }
void @@ -1382,6 +1395,7 @@ xlog_cil_destroy( }
ASSERT(list_empty(&log->l_cilp->xc_cil)); + destroy_workqueue(log->l_cilp->xc_push_wq); kmem_free(log->l_cilp); }
diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h index 587207d2d7f5..ef96bd5e7e56 100644 --- a/fs/xfs/xfs_log_priv.h +++ b/fs/xfs/xfs_log_priv.h @@ -275,6 +275,7 @@ struct xfs_cil { struct xlog *xc_log; struct list_head xc_cil; spinlock_t xc_cil_lock; + struct workqueue_struct *xc_push_wq;
struct rw_semaphore xc_ctx_lock ____cacheline_aligned_in_smp; struct xfs_cil_ctx *xc_ctx; diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index b75f0e7b67ef..46dba3289d10 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -106,7 +106,6 @@ typedef struct xfs_mount { struct xfs_mru_cache *m_filestream; /* per-mount filestream data */ struct workqueue_struct *m_buf_workqueue; struct workqueue_struct *m_unwritten_workqueue; - struct workqueue_struct *m_cil_workqueue; struct workqueue_struct *m_reclaim_workqueue; struct workqueue_struct *m_sync_workqueue; struct workqueue_struct *m_blockgc_wq; diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index b6b41f0d3c71..475bb2ffd4f4 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -524,21 +524,11 @@ xfs_init_mount_workqueues( if (!mp->m_unwritten_workqueue) goto out_destroy_buf;
- /* - * Limit the CIL pipeline depth to 4 concurrent works to bound the - * concurrency the log spinlocks will be exposed to. - */ - mp->m_cil_workqueue = alloc_workqueue("xfs-cil/%s", - XFS_WQFLAGS(WQ_FREEZABLE | WQ_MEM_RECLAIM | WQ_UNBOUND), - 4, mp->m_super->s_id); - if (!mp->m_cil_workqueue) - goto out_destroy_unwritten; - mp->m_reclaim_workqueue = alloc_workqueue("xfs-reclaim/%s", XFS_WQFLAGS(WQ_FREEZABLE | WQ_MEM_RECLAIM), 0, mp->m_super->s_id); if (!mp->m_reclaim_workqueue) - goto out_destroy_cil; + goto out_destroy_unwritten;
mp->m_blockgc_wq = alloc_workqueue("xfs-blockgc/%s", XFS_WQFLAGS(WQ_UNBOUND | WQ_FREEZABLE | WQ_MEM_RECLAIM), @@ -565,8 +555,6 @@ xfs_init_mount_workqueues( destroy_workqueue(mp->m_blockgc_wq); out_destroy_reclaim: destroy_workqueue(mp->m_reclaim_workqueue); -out_destroy_cil: - destroy_workqueue(mp->m_cil_workqueue); out_destroy_unwritten: destroy_workqueue(mp->m_unwritten_workqueue); out_destroy_buf: @@ -583,7 +571,6 @@ xfs_destroy_mount_workqueues( destroy_workqueue(mp->m_blockgc_wq); destroy_workqueue(mp->m_inodegc_wq); destroy_workqueue(mp->m_reclaim_workqueue); - destroy_workqueue(mp->m_cil_workqueue); destroy_workqueue(mp->m_unwritten_workqueue); destroy_workqueue(mp->m_buf_workqueue); }
From: Wei Yongjun weiyongjun1@huawei.com
hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4PNEK CVE: NA
-------------------------------------------------
Add config item CONFIG_TCP_COMP for tcp payload compression.
This allows payload compression handling of the TCP protocol to be done in-kernel.
This patch only adds the CONFIG_TCP_COMP config option; the TCP compression capability itself is implemented by later patches.
Signed-off-by: Wei Yongjun weiyongjun1@huawei.com Signed-off-by: Wang Yufen wangyufen@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com Signed-off-by: Lu Wei luwei32@huawei.com Reviewed-by: Wei Yongjun weiyongjun1@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- net/ipv4/Kconfig | 8 ++++++++ 1 file changed, 8 insertions(+)
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig index 87983e70f03f..59ffbf80f7f2 100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig @@ -742,3 +742,11 @@ config TCP_MD5SIG on the Internet.
If unsure, say N. + +config TCP_COMP + bool "TCP: Transport Layer Compression support" + help + Enable kernel payload compression support for TCP protocol. This allows + payload compression handling of the TCP protocol to be done in-kernel. + + If unsure, say Y.
From: Wang Yufen wangyufen@huawei.com
hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4PNEK CVE: NA
-------------------------------------------------
Add the new TCP COMP option to SYN and SYN-ACK packets when TCP COMP is enabled. A connection compresses its payload only when both sides support it.
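For reference, the option is carried as a single 32-bit experimental TCP option word, as built by comp_options_write() in the diff below (TCPOPT_EXP is the standard experimental option kind, 254):

	/*
	 * kind = TCPOPT_EXP (254), len = TCPOLEN_EXP_COMP_BASE (4),
	 * followed by the 16-bit magic 0x7954 (TCPOPT_COMP_MAGIC):
	 *
	 *   +--------+--------+--------+--------+
	 *   |  254   |   4    |      0x7954     |
	 *   +--------+--------+--------+--------+
	 */
	*ptr++ = htonl((TCPOPT_EXP << 24) | (TCPOLEN_EXP_COMP_BASE << 16) |
		       TCPOPT_COMP_MAGIC);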
Signed-off-by: Wang Yufen wangyufen@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com Signed-off-by: Lu Wei luwei32@huawei.com Reviewed-by: Wei Yongjun weiyongjun1@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- include/linux/tcp.h | 6 ++++- include/net/inet_sock.h | 3 ++- include/net/sock.h | 1 + include/net/tcp.h | 12 ++++++++++ net/ipv4/Makefile | 1 + net/ipv4/syncookies.c | 2 ++ net/ipv4/tcp.c | 4 ++++ net/ipv4/tcp_comp.c | 13 ++++++++++ net/ipv4/tcp_input.c | 25 +++++++++++++++++++ net/ipv4/tcp_minisocks.c | 3 +++ net/ipv4/tcp_output.c | 52 ++++++++++++++++++++++++++++++++++++++++ net/ipv6/syncookies.c | 2 ++ 12 files changed, 122 insertions(+), 2 deletions(-) create mode 100644 net/ipv4/tcp_comp.c
diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 2f87377e9af7..5293d504b09b 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -93,7 +93,8 @@ struct tcp_options_received { snd_wscale : 4, /* Window scaling received from sender */ rcv_wscale : 4; /* Window scaling to send to receiver */ u8 saw_unknown:1, /* Received unknown option */ - unused:7; + comp_ok:1, /* COMP seen on SYN packet */ + unused:6; u8 num_sacks; /* Number of SACK blocks */ u16 user_mss; /* mss requested by user in ioctl */ u16 mss_clamp; /* Maximal mss, negotiated at connection setup */ @@ -106,6 +107,9 @@ static inline void tcp_clear_options(struct tcp_options_received *rx_opt) #if IS_ENABLED(CONFIG_SMC) rx_opt->smc_ok = 0; #endif +#if IS_ENABLED(CONFIG_TCP_COMP) + rx_opt->comp_ok = 0; +#endif }
/* This is the max number of SACKS that we'll generate and process. It's safe diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h index 89163ef8cf4b..f29059a262da 100644 --- a/include/net/inet_sock.h +++ b/include/net/inet_sock.h @@ -87,7 +87,8 @@ struct inet_request_sock { ecn_ok : 1, acked : 1, no_srccheck: 1, - smc_ok : 1; + smc_ok : 1, + comp_ok : 1; u32 ir_mark; union { struct ip_options_rcu __rcu *ireq_opt; diff --git a/include/net/sock.h b/include/net/sock.h index 16b085f6c0bc..712bb7b09f96 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -873,6 +873,7 @@ enum sock_flags { SOCK_TXTIME, SOCK_XDP, /* XDP is attached */ SOCK_TSTAMP_NEW, /* Indicates 64 bit timestamps always */ + SOCK_COMP, };
#define SK_FLAGS_TIMESTAMP ((1UL << SOCK_TIMESTAMP) | (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE)) diff --git a/include/net/tcp.h b/include/net/tcp.h index bbd8fb63f4c1..130e7fe4537c 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -195,6 +195,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo); */ #define TCPOPT_FASTOPEN_MAGIC 0xF989 #define TCPOPT_SMC_MAGIC 0xE2D4C3D9 +#define TCPOPT_COMP_MAGIC 0x7954
/* * TCP option lengths @@ -208,6 +209,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo); #define TCPOLEN_FASTOPEN_BASE 2 #define TCPOLEN_EXP_FASTOPEN_BASE 4 #define TCPOLEN_EXP_SMC_BASE 6 +#define TCPOLEN_EXP_COMP_BASE 4
/* But this is what stacks really send out. */ #define TCPOLEN_TSTAMP_ALIGNED 12 @@ -2378,4 +2380,14 @@ static inline u64 tcp_transmit_time(const struct sock *sk) return 0; }
+#if IS_ENABLED(CONFIG_TCP_COMP) +extern struct static_key_false tcp_have_comp; +bool tcp_syn_comp_enabled(const struct tcp_sock *tp); +#else +static inline bool tcp_syn_comp_enabled(const struct tcp_sock *tp) +{ + return false; +} +#endif + #endif /* _TCP_H */ diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile index 5b77a46885b9..1f3846c3a154 100644 --- a/net/ipv4/Makefile +++ b/net/ipv4/Makefile @@ -64,6 +64,7 @@ obj-$(CONFIG_TCP_CONG_ILLINOIS) += tcp_illinois.o obj-$(CONFIG_NET_SOCK_MSG) += tcp_bpf.o obj-$(CONFIG_BPF_STREAM_PARSER) += udp_bpf.o obj-$(CONFIG_NETLABEL) += cipso_ipv4.o +obj-$(CONFIG_TCP_COMP) += tcp_comp.o
obj-$(CONFIG_XFRM) += xfrm4_policy.o xfrm4_state.o xfrm4_input.o \ xfrm4_output.o xfrm4_protocol.o diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index 00dc3f943c80..0248216c92ca 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c @@ -392,6 +392,8 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb)
if (IS_ENABLED(CONFIG_SMC)) ireq->smc_ok = 0; + if (IS_ENABLED(CONFIG_TCP_COMP)) + ireq->comp_ok = 0;
ireq->ir_iif = inet_request_bound_dev_if(sk, skb);
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index e8aca226c4ae..4d7bb05df4e4 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -294,6 +294,10 @@ DEFINE_STATIC_KEY_FALSE(tcp_have_smc); EXPORT_SYMBOL(tcp_have_smc); #endif
+#if IS_ENABLED(CONFIG_TCP_COMP) +DEFINE_STATIC_KEY_FALSE(tcp_have_comp); +#endif + /* * Current number of TCP sockets. */ diff --git a/net/ipv4/tcp_comp.c b/net/ipv4/tcp_comp.c new file mode 100644 index 000000000000..e2bf4fbb4c3f --- /dev/null +++ b/net/ipv4/tcp_comp.c @@ -0,0 +1,13 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * TCP compression support + * + * Copyright(c) 2021 Huawei Technologies Co., Ltd + */ + +#include <net/tcp.h> + +bool tcp_syn_comp_enabled(const struct tcp_sock *tp) +{ + return true; +} diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 991e3434957b..e0c05a9e1274 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -3905,6 +3905,24 @@ static bool smc_parse_options(const struct tcphdr *th, return false; }
+static bool tcp_parse_comp_option(const struct tcphdr *th, + struct tcp_options_received *opt_rx, + const unsigned char *ptr, + int opsize) +{ +#if IS_ENABLED(CONFIG_TCP_COMP) + if (static_branch_unlikely(&tcp_have_comp)) { + if (th->syn && !(opsize & 1) && + opsize >= TCPOLEN_EXP_COMP_BASE && + get_unaligned_be16(ptr) == TCPOPT_COMP_MAGIC) { + opt_rx->comp_ok = 1; + return true; + } + } +#endif + return false; +} + /* Try to parse the MSS option from the TCP header. Return 0 on failure, clamped * value on success. */ @@ -4064,6 +4082,10 @@ void tcp_parse_options(const struct net *net, if (smc_parse_options(th, opt_rx, ptr, opsize)) break;
+ if (tcp_parse_comp_option(th, opt_rx, ptr, + opsize)) + break; + opt_rx->saw_unknown = 1; break;
@@ -6634,6 +6656,9 @@ static void tcp_openreq_init(struct request_sock *req, #if IS_ENABLED(CONFIG_SMC) ireq->smc_ok = rx_opt->smc_ok; #endif +#if IS_ENABLED(CONFIG_TCP_COMP) + ireq->comp_ok = rx_opt->comp_ok; +#endif }
struct request_sock *inet_reqsk_alloc(const struct request_sock_ops *ops, diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index f0f67b25c97a..dafa5bc71bba 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -512,6 +512,9 @@ struct sock *tcp_create_openreq_child(const struct sock *sk, newtp->rcv_ssthresh = req->rsk_rcv_wnd; newtp->rcv_wnd = req->rsk_rcv_wnd; newtp->rx_opt.wscale_ok = ireq->wscale_ok; +#if IS_ENABLED(CONFIG_TCP_COMP) + newtp->rx_opt.comp_ok = ireq->comp_ok; +#endif if (newtp->rx_opt.wscale_ok) { newtp->rx_opt.snd_wscale = ireq->snd_wscale; newtp->rx_opt.rcv_wscale = ireq->rcv_wscale; diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 19ef4577b70d..7e95af49acba 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -416,6 +416,7 @@ static inline bool tcp_urg_mode(const struct tcp_sock *tp) #define OPTION_FAST_OPEN_COOKIE (1 << 8) #define OPTION_SMC (1 << 9) #define OPTION_MPTCP (1 << 10) +#define OPTION_COMP (1 << 11)
static void smc_options_write(__be32 *ptr, u16 *options) { @@ -432,6 +433,19 @@ static void smc_options_write(__be32 *ptr, u16 *options) #endif }
+static void comp_options_write(__be32 *ptr, u16 *options) +{ +#if IS_ENABLED(CONFIG_TCP_COMP) + if (static_branch_unlikely(&tcp_have_comp)) { + if (unlikely(OPTION_COMP & *options)) { + *ptr++ = htonl((TCPOPT_EXP << 24) | + (TCPOLEN_EXP_COMP_BASE << 16) | + (TCPOPT_COMP_MAGIC)); + } + } +#endif +} + struct tcp_out_options { u16 options; /* bit field of OPTION_* */ u16 mss; /* 0 to disable */ @@ -702,6 +716,8 @@ static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp, smc_options_write(ptr, &options);
mptcp_options_write(ptr, opts); + + comp_options_write(ptr, &options); }
static void smc_set_option(const struct tcp_sock *tp, @@ -720,6 +736,39 @@ static void smc_set_option(const struct tcp_sock *tp, #endif }
+static void comp_set_option(const struct tcp_sock *tp, + struct tcp_out_options *opts, + unsigned int *remaining) +{ +#if IS_ENABLED(CONFIG_TCP_COMP) + if (static_branch_unlikely(&tcp_have_comp)) { + if (tcp_syn_comp_enabled(tp)) { + if (*remaining >= TCPOLEN_EXP_COMP_BASE) { + opts->options |= OPTION_COMP; + *remaining -= TCPOLEN_EXP_COMP_BASE; + } + } + } +#endif +} + +static void comp_set_option_cond(const struct tcp_sock *tp, + const struct inet_request_sock *ireq, + struct tcp_out_options *opts, + unsigned int *remaining) +{ +#if IS_ENABLED(CONFIG_TCP_COMP) + if (static_branch_unlikely(&tcp_have_comp)) { + if (tcp_syn_comp_enabled(tp) && ireq->comp_ok) { + if (*remaining >= TCPOLEN_EXP_COMP_BASE) { + opts->options |= OPTION_COMP; + *remaining -= TCPOLEN_EXP_COMP_BASE; + } + } + } +#endif +} + static void smc_set_option_cond(const struct tcp_sock *tp, const struct inet_request_sock *ireq, struct tcp_out_options *opts, @@ -821,6 +870,7 @@ static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb, }
smc_set_option(tp, opts, &remaining); + comp_set_option(tp, opts, &remaining);
if (sk_is_mptcp(sk)) { unsigned int size; @@ -901,6 +951,8 @@ static unsigned int tcp_synack_options(const struct sock *sk,
smc_set_option_cond(tcp_sk(sk), ireq, opts, &remaining);
+ comp_set_option_cond(tcp_sk(sk), ireq, opts, &remaining); + bpf_skops_hdr_opt_len((struct sock *)sk, skb, req, syn_skb, synack_type, opts, &remaining);
diff --git a/net/ipv6/syncookies.c b/net/ipv6/syncookies.c index 9b6cae1e49d9..49e9342ee551 100644 --- a/net/ipv6/syncookies.c +++ b/net/ipv6/syncookies.c @@ -214,6 +214,8 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb) treq->txhash = net_tx_rndhash(); if (IS_ENABLED(CONFIG_SMC)) ireq->smc_ok = 0; + if (IS_ENABLED(CONFIG_TCP_COMP)) + ireq->comp_ok = 0;
/* * We need to lookup the dst_entry to get the correct window size.
From: Wang Yufen wangyufen@huawei.com
hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4PNEK CVE: NA
-------------------------------------------------
When a TCP connection is established or closed, the TCP compression state needs to be initialized or cleaned up at the same time.

Add dummy init and cleanup hooks for TCP compression. They will be implemented in later patches.
Signed-off-by: Wei Yongjun weiyongjun1@huawei.com Signed-off-by: Wang Yufen wangyufen@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com Signed-off-by: Lu Wei luwei32@huawei.com Reviewed-by: Wei Yongjun weiyongjun1@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- include/net/tcp.h | 9 +++++++++ net/ipv4/tcp_comp.c | 8 ++++++++ net/ipv4/tcp_input.c | 1 + net/ipv4/tcp_ipv4.c | 2 ++ 4 files changed, 20 insertions(+)
diff --git a/include/net/tcp.h b/include/net/tcp.h index 130e7fe4537c..edd7bfa87871 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -2383,11 +2383,20 @@ static inline u64 tcp_transmit_time(const struct sock *sk) #if IS_ENABLED(CONFIG_TCP_COMP) extern struct static_key_false tcp_have_comp; bool tcp_syn_comp_enabled(const struct tcp_sock *tp); +void tcp_init_compression(struct sock *sk); +void tcp_cleanup_compression(struct sock *sk); #else static inline bool tcp_syn_comp_enabled(const struct tcp_sock *tp) { return false; } +static inline void tcp_init_compression(struct sock *sk) +{ +} + +static inline void tcp_cleanup_compression(struct sock *sk) +{ +} #endif
#endif /* _TCP_H */ diff --git a/net/ipv4/tcp_comp.c b/net/ipv4/tcp_comp.c index e2bf4fbb4c3f..067d48b72429 100644 --- a/net/ipv4/tcp_comp.c +++ b/net/ipv4/tcp_comp.c @@ -11,3 +11,11 @@ bool tcp_syn_comp_enabled(const struct tcp_sock *tp) { return true; } + +void tcp_init_compression(struct sock *sk) +{ +} + +void tcp_cleanup_compression(struct sock *sk) +{ +} diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index e0c05a9e1274..711bf63dc026 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -5937,6 +5937,7 @@ void tcp_init_transfer(struct sock *sk, int bpf_op, struct sk_buff *skb) /* Initialize congestion control unless BPF initialized it already: */ if (!icsk->icsk_ca_initialized) tcp_init_congestion_control(sk); + tcp_init_compression(sk); tcp_init_buffer_space(sk); }
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 017cd666387f..ebfeeeadd47c 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2218,6 +2218,8 @@ void tcp_v4_destroy_sock(struct sock *sk)
tcp_cleanup_congestion_control(sk);
+ tcp_cleanup_compression(sk); + tcp_cleanup_ulp(sk);
/* Cleanup up the write buffer. */
From: Wei Yongjun weiyongjun1@huawei.com
hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4PNEK CVE: NA
-------------------------------------------------
Add a sysctl interface to enable/disable TCP compression by port.
Example:
$ echo 4000 > /proc/sys/net/ipv4/tcp_compression_ports
  will enable port 4000 for tcp compression
$ echo 4000,5000 > /proc/sys/net/ipv4/tcp_compression_ports
  will enable both port 4000 and 5000 for tcp compression
$ echo > /proc/sys/net/ipv4/tcp_compression_ports
  will disable tcp compression.
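Internally, proc_do_large_bitmap() parses the written port list into a 65536-bit bitmap. A hypothetical helper (not part of this patch; the real lookup arrives with tcp_syn_comp_enabled() in a later patch) shows how a port query against that bitmap would read:

	/* Illustration only: test whether a port is enabled for
	 * compression in the sysctl bitmap. */
	static bool tcp_comp_port_enabled(u16 port)
	{
		return test_bit(port, sysctl_tcp_compression_ports);
	}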
Signed-off-by: Wei Yongjun weiyongjun1@huawei.com Signed-off-by: Wang Yufen wangyufen@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com Signed-off-by: Lu Wei luwei32@huawei.com Reviewed-by: Wei Yongjun weiyongjun1@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- include/net/tcp.h | 3 +++ net/ipv4/sysctl_net_ipv4.c | 33 +++++++++++++++++++++++++++++++++ net/ipv4/tcp_comp.c | 4 ++++ 3 files changed, 40 insertions(+)
diff --git a/include/net/tcp.h b/include/net/tcp.h index edd7bfa87871..9725b1038c84 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -2382,6 +2382,9 @@ static inline u64 tcp_transmit_time(const struct sock *sk)
#if IS_ENABLED(CONFIG_TCP_COMP) extern struct static_key_false tcp_have_comp; + +extern unsigned long *sysctl_tcp_compression_ports; + bool tcp_syn_comp_enabled(const struct tcp_sock *tp); void tcp_init_compression(struct sock *sk); void tcp_cleanup_compression(struct sock *sk); diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 08829809e88b..bbd641bb27f4 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -465,6 +465,30 @@ static int proc_fib_multipath_hash_policy(struct ctl_table *table, int write, } #endif
+#if IS_ENABLED(CONFIG_TCP_COMP) +static int proc_tcp_compression_ports(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + unsigned long *bitmap = *(unsigned long **)table->data; + unsigned long bitmap_len = table->maxlen; + int ret; + + ret = proc_do_large_bitmap(table, write, buffer, lenp, ppos); + if (write && ret == 0) { + if (bitmap_empty(bitmap, bitmap_len)) { + if (static_key_enabled(&tcp_have_comp)) + static_branch_disable(&tcp_have_comp); + } else { + if (!static_key_enabled(&tcp_have_comp)) + static_branch_enable(&tcp_have_comp); + } + } + + return ret; +} +#endif + static struct ctl_table ipv4_table[] = { { .procname = "tcp_max_orphans", @@ -588,6 +612,15 @@ static struct ctl_table ipv4_table[] = { .mode = 0644, .proc_handler = proc_do_static_key, }, +#if IS_ENABLED(CONFIG_TCP_COMP) + { + .procname = "tcp_compression_ports", + .data = &sysctl_tcp_compression_ports, + .maxlen = 65536, + .mode = 0644, + .proc_handler = proc_tcp_compression_ports, + }, +#endif { } };
diff --git a/net/ipv4/tcp_comp.c b/net/ipv4/tcp_comp.c
index 067d48b72429..3493255d34df 100644
--- a/net/ipv4/tcp_comp.c
+++ b/net/ipv4/tcp_comp.c
@@ -7,6 +7,10 @@
#include <net/tcp.h>
+static unsigned long tcp_compression_ports[65536 / 8];
+
+unsigned long *sysctl_tcp_compression_ports = tcp_compression_ports;
+
 bool tcp_syn_comp_enabled(const struct tcp_sock *tp)
 {
 	return true;
From: Wang Yufen wangyufen@huawei.com
hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4PNEK CVE: NA
-------------------------------------------------
Only enable compression for the given server ports: this means we check the dport when sending a SYN (active open) and the sport when sending a SYN-ACK (passive open).
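As an illustration of that port-selection rule, here is a minimal standalone C sketch; struct conn and comp_port_enabled() are hypothetical stand-ins for the inet_sock port fields and the sysctl bitmap lookup:

#include <stdbool.h>
#include <stdio.h>

/* Hypothetical stand-ins for inet_sock's ports and the configured bitmap. */
struct conn {
	int sport;
	int dport;
};

static bool comp_port_enabled(int port)
{
	return port == 4000;	/* pretend only port 4000 is configured */
}

static bool syn_comp_enabled(const struct conn *c, bool active)
{
	/*
	 * Active open (we send the SYN): the server is the peer, so the
	 * configured server port is our dport.  Passive open (we send the
	 * SYN-ACK): we are the server, so it is our sport.
	 */
	return comp_port_enabled(active ? c->dport : c->sport);
}

int main(void)
{
	struct conn client = { .sport = 53211, .dport = 4000 };
	struct conn server = { .sport = 4000, .dport = 53211 };

	printf("client SYN: %d, server SYN-ACK: %d\n",
	       syn_comp_enabled(&client, true),
	       syn_comp_enabled(&server, false));
	return 0;
}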
Signed-off-by: Wei Yongjun weiyongjun1@huawei.com Signed-off-by: Wang Yufen wangyufen@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com Signed-off-by: Lu Wei luwei32@huawei.com Reviewed-by: Wei Yongjun weiyongjun1@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- include/net/tcp.h | 2 +- net/ipv4/tcp_comp.c | 18 ++++++++++++++++-- net/ipv4/tcp_output.c | 12 ++++++------ 3 files changed, 23 insertions(+), 9 deletions(-)
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 9725b1038c84..353a0d9e4c8e 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -2385,7 +2385,7 @@ extern struct static_key_false tcp_have_comp;
extern unsigned long *sysctl_tcp_compression_ports;
-bool tcp_syn_comp_enabled(const struct tcp_sock *tp);
+bool tcp_syn_comp_enabled(const struct sock *sk, bool active);
 void tcp_init_compression(struct sock *sk);
 void tcp_cleanup_compression(struct sock *sk);
 #else
diff --git a/net/ipv4/tcp_comp.c b/net/ipv4/tcp_comp.c
index 3493255d34df..fb76813aa106 100644
--- a/net/ipv4/tcp_comp.c
+++ b/net/ipv4/tcp_comp.c
@@ -11,13 +11,27 @@ static unsigned long tcp_compression_ports[65536 / 8];
unsigned long *sysctl_tcp_compression_ports = tcp_compression_ports;
-bool tcp_syn_comp_enabled(const struct tcp_sock *tp)
+bool tcp_syn_comp_enabled(const struct sock *sk, bool active)
 {
-	return true;
+	struct inet_sock *inet = inet_sk(sk);
+	int port;
+
+	if (active)
+		port = ntohs(inet->inet_dport);
+	else
+		port = ntohs(inet->inet_sport);
+
+	return test_bit(port, sysctl_tcp_compression_ports);
 }
 void tcp_init_compression(struct sock *sk)
 {
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	if (!tp->rx_opt.comp_ok)
+		return;
+
+	sock_set_flag(sk, SOCK_COMP);
 }
 void tcp_cleanup_compression(struct sock *sk)
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 7e95af49acba..3ef6b1186f45 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -736,13 +736,13 @@ static void smc_set_option(const struct tcp_sock *tp,
 #endif
 }
-static void comp_set_option(const struct tcp_sock *tp,
+static void comp_set_option(const struct sock *sk,
 			    struct tcp_out_options *opts,
 			    unsigned int *remaining)
 {
 #if IS_ENABLED(CONFIG_TCP_COMP)
 	if (static_branch_unlikely(&tcp_have_comp)) {
-		if (tcp_syn_comp_enabled(tp)) {
+		if (tcp_syn_comp_enabled(sk, true)) {
 			if (*remaining >= TCPOLEN_EXP_COMP_BASE) {
 				opts->options |= OPTION_COMP;
 				*remaining -= TCPOLEN_EXP_COMP_BASE;
@@ -752,14 +752,14 @@ static void comp_set_option(const struct tcp_sock *tp,
 #endif
 }
-static void comp_set_option_cond(const struct tcp_sock *tp,
+static void comp_set_option_cond(const struct sock *sk,
 				 const struct inet_request_sock *ireq,
 				 struct tcp_out_options *opts,
 				 unsigned int *remaining)
 {
 #if IS_ENABLED(CONFIG_TCP_COMP)
 	if (static_branch_unlikely(&tcp_have_comp)) {
-		if (tcp_syn_comp_enabled(tp) && ireq->comp_ok) {
+		if (tcp_syn_comp_enabled(sk, false) && ireq->comp_ok) {
 			if (*remaining >= TCPOLEN_EXP_COMP_BASE) {
 				opts->options |= OPTION_COMP;
 				*remaining -= TCPOLEN_EXP_COMP_BASE;
@@ -870,7 +870,7 @@ static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb,
 }
 	smc_set_option(tp, opts, &remaining);
-	comp_set_option(tp, opts, &remaining);
+	comp_set_option(sk, opts, &remaining);
 	if (sk_is_mptcp(sk)) {
 		unsigned int size;
@@ -951,7 +951,7 @@ static unsigned int tcp_synack_options(const struct sock *sk,
smc_set_option_cond(tcp_sk(sk), ireq, opts, &remaining);
-	comp_set_option_cond(tcp_sk(sk), ireq, opts, &remaining);
+	comp_set_option_cond(sk, ireq, opts, &remaining);
 	bpf_skops_hdr_opt_len((struct sock *)sk, skb, req, syn_skb,
 			      synack_type, opts, &remaining);
From: Wang Hai wanghai38@huawei.com
hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4PNEK CVE: NA
-------------------------------------------------
Tcp compression is used to reduce the amount of data transmitted between machines, which can increase the effective transmission capacity.
A local tcp connection transfers data within a single machine, so there is nothing to gain from compressing it. Skip such connections by default.
Enable by sysctl:
$ echo 1 > /proc/sys/net/ipv4/tcp_compression_local
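The skip logic amounts to a short predicate over the source and destination addresses (see the tcp_comp.c hunk below). Here is a hedged userspace model of that check; is_loopback() approximates the kernel's ipv4_is_loopback() (address in 127.0.0.0/8), and allow_local stands in for sysctl_tcp_compression_local:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <arpa/inet.h>

static int allow_local;	/* models sysctl_tcp_compression_local, default 0 */

/* ipv4_is_loopback() equivalent: network-order address in 127.0.0.0/8. */
static bool is_loopback(uint32_t addr_be)
{
	return (ntohl(addr_be) >> 24) == 127;
}

static bool comp_allowed(uint32_t saddr, uint32_t daddr)
{
	/* Same host talking to itself, or a loopback peer: skip
	 * compression unless tcp_compression_local is set. */
	if (!allow_local && (saddr == daddr || is_loopback(daddr)))
		return false;
	return true;
}

int main(void)
{
	uint32_t lo = inet_addr("127.0.0.1");
	uint32_t remote = inet_addr("192.0.2.1");
	uint32_t self = inet_addr("192.0.2.2");

	printf("to loopback: %d, to remote: %d, to self: %d\n",
	       comp_allowed(self, lo), comp_allowed(self, remote),
	       comp_allowed(self, self));
	return 0;
}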
Signed-off-by: Wang Hai wanghai38@huawei.com Signed-off-by: Wei Yongjun weiyongjun1@huawei.com Signed-off-by: Wang Yufen wangyufen@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com Signed-off-by: Lu Wei luwei32@huawei.com Reviewed-by: Wei Yongjun weiyongjun1@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- include/net/tcp.h | 12 +++++++++++- net/ipv4/sysctl_net_ipv4.c | 9 +++++++++ net/ipv4/tcp_comp.c | 31 ++++++++++++++++++++++++------- net/ipv4/tcp_output.c | 4 ++-- 4 files changed, 46 insertions(+), 10 deletions(-)
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 353a0d9e4c8e..cee41cca6d2c 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -2384,8 +2384,11 @@ static inline u64 tcp_transmit_time(const struct sock *sk)
 extern struct static_key_false tcp_have_comp;
 extern unsigned long *sysctl_tcp_compression_ports;
+extern int sysctl_tcp_compression_local;
-bool tcp_syn_comp_enabled(const struct sock *sk, bool active);
+bool tcp_syn_comp_enabled(const struct sock *sk);
+bool tcp_synack_comp_enabled(const struct sock *sk,
+			     const struct inet_request_sock *ireq);
 void tcp_init_compression(struct sock *sk);
 void tcp_cleanup_compression(struct sock *sk);
 #else
@@ -2393,6 +2396,13 @@ static inline bool tcp_syn_comp_enabled(const struct tcp_sock *tp)
 {
 	return false;
 }
+
+static inline bool tcp_synack_comp_enabled(const struct sock *sk,
+					   const struct inet_request_sock *ireq)
+{
+	return false;
+}
+
 static inline void tcp_init_compression(struct sock *sk)
 {
 }
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index bbd641bb27f4..1cffacd985c1 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -620,6 +620,15 @@ static struct ctl_table ipv4_table[] = {
 		.mode = 0644,
 		.proc_handler = proc_tcp_compression_ports,
 	},
+	{
+		.procname = "tcp_compression_local",
+		.data = &sysctl_tcp_compression_local,
+		.maxlen = sizeof(int),
+		.mode = 0644,
+		.proc_handler = proc_dointvec_minmax,
+		.extra1 = SYSCTL_ZERO,
+		.extra2 = SYSCTL_ONE,
+	},
 #endif
 	{ }
 };
diff --git a/net/ipv4/tcp_comp.c b/net/ipv4/tcp_comp.c
index fb76813aa106..afcd50c89e78 100644
--- a/net/ipv4/tcp_comp.c
+++ b/net/ipv4/tcp_comp.c
@@ -10,18 +10,35 @@ static unsigned long tcp_compression_ports[65536 / 8];
 unsigned long *sysctl_tcp_compression_ports = tcp_compression_ports;
+int sysctl_tcp_compression_local __read_mostly;
-bool tcp_syn_comp_enabled(const struct sock *sk, bool active)
+static bool tcp_comp_enabled(__be32 saddr, __be32 daddr, int port)
+{
+	if (!sysctl_tcp_compression_local &&
+	    (saddr == daddr || ipv4_is_loopback(daddr)))
+		return false;
+
+	return test_bit(port, sysctl_tcp_compression_ports);
+}
+
+bool tcp_syn_comp_enabled(const struct sock *sk)
 {
 	struct inet_sock *inet = inet_sk(sk);
-	int port;
-	if (active)
-		port = ntohs(inet->inet_dport);
-	else
-		port = ntohs(inet->inet_sport);
+	return tcp_comp_enabled(inet->inet_saddr, inet->inet_daddr,
+				ntohs(inet->inet_dport));
+}
-	return test_bit(port, sysctl_tcp_compression_ports);
+bool tcp_synack_comp_enabled(const struct sock *sk,
+			     const struct inet_request_sock *ireq)
+{
+	struct inet_sock *inet = inet_sk(sk);
+
+	if (!ireq->comp_ok)
+		return false;
+
+	return tcp_comp_enabled(ireq->ir_loc_addr, ireq->ir_rmt_addr,
+				ntohs(inet->inet_sport));
 }
 void tcp_init_compression(struct sock *sk)
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 3ef6b1186f45..c8420b48de36 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -742,7 +742,7 @@ static void comp_set_option(const struct sock *sk,
 {
 #if IS_ENABLED(CONFIG_TCP_COMP)
 	if (static_branch_unlikely(&tcp_have_comp)) {
-		if (tcp_syn_comp_enabled(sk, true)) {
+		if (tcp_syn_comp_enabled(sk)) {
 			if (*remaining >= TCPOLEN_EXP_COMP_BASE) {
 				opts->options |= OPTION_COMP;
 				*remaining -= TCPOLEN_EXP_COMP_BASE;
@@ -759,7 +759,7 @@ static void comp_set_option_cond(const struct sock *sk,
 {
 #if IS_ENABLED(CONFIG_TCP_COMP)
 	if (static_branch_unlikely(&tcp_have_comp)) {
-		if (tcp_syn_comp_enabled(sk, false) && ireq->comp_ok) {
+		if (tcp_synack_comp_enabled(sk, ireq)) {
 			if (*remaining >= TCPOLEN_EXP_COMP_BASE) {
 				opts->options |= OPTION_COMP;
 				*remaining -= TCPOLEN_EXP_COMP_BASE;
From: Wang Yufen wangyufen@huawei.com
hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4PNEK CVE: NA
-------------------------------------------------
Add stub proto ops for tcp compression socket.
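The stubs follow the proto-override pattern also used by kTLS: copy tcp_prot into a private struct proto, replace sendmsg/recvmsg with wrappers, and stash the original ops in a per-socket context so the wrappers can delegate. A minimal userspace model of that indirection follows (all names are hypothetical illustrations, not the kernel API):

#include <stdio.h>

/* Models struct proto: a table of operation pointers. */
struct ops {
	long (*send)(const char *buf, long len);
};

static long plain_send(const char *buf, long len)
{
	printf("plain send of %ld bytes\n", len);
	return len;
}

static struct ops tcp_ops = { .send = plain_send };
static struct ops comp_ops;	/* models tcp_prot_override */
static struct ops *orig_ops;	/* models ctx->sk_proto */

/* Wrapper installed in the override table; a real implementation would
 * compress here, then delegate to the saved original operation. */
static long comp_send(const char *buf, long len)
{
	printf("compress, then ");
	return orig_ops->send(buf, len);
}

int main(void)
{
	/* Mirrors tcp_comp_init() + tcp_init_compression(). */
	comp_ops = tcp_ops;
	comp_ops.send = comp_send;
	orig_ops = &tcp_ops;

	struct ops *sk_prot = &comp_ops;	/* socket now uses override */
	sk_prot->send("hello", 5);
	return 0;
}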
Signed-off-by: Wei Yongjun weiyongjun1@huawei.com Signed-off-by: Wang Yufen wangyufen@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com Signed-off-by: Lu Wei luwei32@huawei.com Reviewed-by: Wei Yongjun weiyongjun1@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- include/net/tcp.h | 6 +++++ net/ipv4/tcp.c | 1 + net/ipv4/tcp_comp.c | 66 +++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 73 insertions(+)
diff --git a/include/net/tcp.h b/include/net/tcp.h
index cee41cca6d2c..9475e3da53eb 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -2391,6 +2391,7 @@ bool tcp_synack_comp_enabled(const struct sock *sk,
 			     const struct inet_request_sock *ireq);
 void tcp_init_compression(struct sock *sk);
 void tcp_cleanup_compression(struct sock *sk);
+int tcp_comp_init(void);
 #else
 static inline bool tcp_syn_comp_enabled(const struct tcp_sock *tp)
 {
@@ -2410,6 +2411,11 @@ static inline void tcp_init_compression(struct sock *sk)
 static inline void tcp_cleanup_compression(struct sock *sk)
 {
 }
+
+static inline int tcp_comp_init(void)
+{
+	return 0;
+}
 #endif
 #endif /* _TCP_H */
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 4d7bb05df4e4..1d6efb6c6542 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -4248,5 +4248,6 @@ void __init tcp_init(void)
 	tcp_metrics_init();
 	BUG_ON(tcp_register_congestion_control(&tcp_reno) != 0);
 	tcp_tasklet_init();
+	tcp_comp_init();
 	mptcp_init();
 }
diff --git a/net/ipv4/tcp_comp.c b/net/ipv4/tcp_comp.c
index afcd50c89e78..fb90be4d9e9c 100644
--- a/net/ipv4/tcp_comp.c
+++ b/net/ipv4/tcp_comp.c
@@ -12,6 +12,13 @@ static unsigned long tcp_compression_ports[65536 / 8];
 unsigned long *sysctl_tcp_compression_ports = tcp_compression_ports;
 int sysctl_tcp_compression_local __read_mostly;
+static struct proto tcp_prot_override;
+
+struct tcp_comp_context {
+	struct proto *sk_proto;
+	struct rcu_head rcu;
+};
+
 static bool tcp_comp_enabled(__be32 saddr, __be32 daddr, int port)
 {
 	if (!sysctl_tcp_compression_local &&
@@ -41,16 +48,75 @@ bool tcp_synack_comp_enabled(const struct sock *sk,
 				ntohs(inet->inet_sport));
 }
+static struct tcp_comp_context *comp_get_ctx(const struct sock *sk)
+{
+	struct inet_connection_sock *icsk = inet_csk(sk);
+
+	return (__force void *)icsk->icsk_ulp_data;
+}
+
+static int tcp_comp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
+{
+	struct tcp_comp_context *ctx = comp_get_ctx(sk);
+
+	return ctx->sk_proto->sendmsg(sk, msg, size);
+}
+
+static int tcp_comp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
+			    int nonblock, int flags, int *addr_len)
+{
+	struct tcp_comp_context *ctx = comp_get_ctx(sk);
+
+	return ctx->sk_proto->recvmsg(sk, msg, len, nonblock, flags, addr_len);
+}
+
 void tcp_init_compression(struct sock *sk)
 {
+	struct inet_connection_sock *icsk = inet_csk(sk);
+	struct tcp_comp_context *ctx = NULL;
 	struct tcp_sock *tp = tcp_sk(sk);
 	if (!tp->rx_opt.comp_ok)
 		return;
+	ctx = kzalloc(sizeof(*ctx), GFP_ATOMIC);
+	if (!ctx)
+		return;
+
+	ctx->sk_proto = sk->sk_prot;
+	WRITE_ONCE(sk->sk_prot, &tcp_prot_override);
+
+	rcu_assign_pointer(icsk->icsk_ulp_data, ctx);
+
 	sock_set_flag(sk, SOCK_COMP);
 }
+static void tcp_comp_context_free(struct rcu_head *head)
+{
+	struct tcp_comp_context *ctx;
+
+	ctx = container_of(head, struct tcp_comp_context, rcu);
+
+	kfree(ctx);
+}
+
 void tcp_cleanup_compression(struct sock *sk)
 {
+	struct inet_connection_sock *icsk = inet_csk(sk);
+	struct tcp_comp_context *ctx = comp_get_ctx(sk);
+
+	if (!ctx || !sock_flag(sk, SOCK_COMP))
+		return;
+
+	rcu_assign_pointer(icsk->icsk_ulp_data, NULL);
+	call_rcu(&ctx->rcu, tcp_comp_context_free);
+}
+
+int tcp_comp_init(void)
+{
+	tcp_prot_override = tcp_prot;
+	tcp_prot_override.sendmsg = tcp_comp_sendmsg;
+	tcp_prot_override.recvmsg = tcp_comp_recvmsg;
+
+	return 0;
 }
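Note how tcp_comp_context_free() recovers the context from the embedded rcu_head via container_of() once call_rcu() fires. The pattern in isolation, as a runnable userspace sketch (struct comp_ctx is hypothetical, and the deferred call_rcu() step is invoked directly here instead of after a grace period):

#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>

/* Userspace equivalent of the kernel's container_of(). */
#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct rcu_head {
	void (*func)(struct rcu_head *);
};

struct comp_ctx {
	int id;
	struct rcu_head rcu;	/* embedded, as in tcp_comp_context */
};

/* Mirrors tcp_comp_context_free(): recover the outer struct from the
 * embedded rcu_head and free it. */
static void ctx_free(struct rcu_head *head)
{
	struct comp_ctx *ctx = container_of(head, struct comp_ctx, rcu);

	printf("freeing ctx %d\n", ctx->id);
	free(ctx);
}

int main(void)
{
	struct comp_ctx *ctx = calloc(1, sizeof(*ctx));

	ctx->id = 42;
	/* In the kernel, call_rcu() would defer this callback until all
	 * RCU readers of the old pointer have finished. */
	ctx->rcu.func = ctx_free;
	ctx->rcu.func(&ctx->rcu);
	return 0;
}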