From: Keefe LIU liuqifa@huawei.com
hulk inclusion category: feature bugzilla: 9511, https://gitee.com/openeuler/kernel/issues/I4IHL1 CVE: NA
-------------------------------------------------
In a typical IPvlan L2 setup where master is in default-ns and each slave is into different (slave) ns. In this setup, if master and slaves in different net, egress packet processing for traffic originating from slave-ns can't be forwarded to master or other machine whose ip in the same net with master, and they can't be forwarded to other interface in default-ns.
This patch introuce a new mode l2e for ipvlan to realize above goals, and it won't affect the original l2, l3, l3s mode.
As the ip tool doesn't support l2e mode, We use module param "ipvlan_default_mode" to set the default work mode. 0 for l2 mode, 1 for l3, 2 for l2e, 3 for l3s, others invalid now. Attention, when we create ipvlan devices by "ip" commond, if we assign the mode, ipvlan will work in the mode we assigned other then the "ipvlan_default_mode".
Signed-off-by: Keefe LIU liuqifa@huawei.com Reviewed-by: Wei Yongjun weiyongjun1@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com
Modified: when insmod the ipvlan module, the origin version(4.19) is 0 for l2, 1 for l3, 2 for l2e, 3 for l3s mode, but in 5.10 version is 0 for l2, 1 for l3, 2 for l3s, 3 for l2e mode.
Signed-off-by: Lu Wei luwei32@huawei.com Reviewed-by: Yue Haibing yuehaibing@huawei.com Reviewed-by: Wei Yongjun weiyongjun1@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- drivers/net/ipvlan/ipvlan_core.c | 190 +++++++++++++++++++++++++++++++ drivers/net/ipvlan/ipvlan_main.c | 6 +- include/uapi/linux/if_link.h | 1 + 3 files changed, 196 insertions(+), 1 deletion(-)
diff --git a/drivers/net/ipvlan/ipvlan_core.c b/drivers/net/ipvlan/ipvlan_core.c index 8801d093135c..5b695ec5c650 100644 --- a/drivers/net/ipvlan/ipvlan_core.c +++ b/drivers/net/ipvlan/ipvlan_core.c @@ -532,6 +532,122 @@ static int ipvlan_process_outbound(struct sk_buff *skb) return ret; }
+static int ipvlan_process_v4_forward(struct sk_buff *skb) +{ + const struct iphdr *ip4h = ip_hdr(skb); + struct net_device *dev = skb->dev; + struct net *net = dev_net(dev); + struct rtable *rt; + int err, ret = NET_XMIT_DROP; + struct flowi4 fl4 = { + .flowi4_tos = RT_TOS(ip4h->tos), + .flowi4_flags = FLOWI_FLAG_ANYSRC, + .flowi4_mark = skb->mark, + .daddr = ip4h->daddr, + .saddr = ip4h->saddr, + }; + + rt = ip_route_output_flow(net, &fl4, NULL); + if (IS_ERR(rt)) + goto err; + + if (rt->rt_type != RTN_UNICAST && rt->rt_type != RTN_LOCAL) { + ip_rt_put(rt); + goto err; + } + skb_dst_set(skb, &rt->dst); + err = ip_local_out(net, skb->sk, skb); + if (unlikely(net_xmit_eval(err))) + dev->stats.tx_errors++; + else + ret = NET_XMIT_SUCCESS; + goto out; +err: + dev->stats.tx_errors++; + kfree_skb(skb); +out: + return ret; +} + +#if IS_ENABLED(CONFIG_IPV6) +static int ipvlan_process_v6_forward(struct sk_buff *skb) +{ + const struct ipv6hdr *ip6h = ipv6_hdr(skb); + struct net_device *dev = skb->dev; + struct net *net = dev_net(dev); + struct dst_entry *dst; + int err, ret = NET_XMIT_DROP; + struct flowi6 fl6 = { + .daddr = ip6h->daddr, + .saddr = ip6h->saddr, + .flowi6_flags = FLOWI_FLAG_ANYSRC, + .flowlabel = ip6_flowinfo(ip6h), + .flowi6_mark = skb->mark, + .flowi6_proto = ip6h->nexthdr, + }; + + dst = ip6_route_output(net, NULL, &fl6); + if (dst->error) { + ret = dst->error; + dst_release(dst); + goto err; + } + skb_dst_set(skb, dst); + err = ip6_local_out(net, skb->sk, skb); + if (unlikely(net_xmit_eval(err))) + dev->stats.tx_errors++; + else + ret = NET_XMIT_SUCCESS; + goto out; +err: + dev->stats.tx_errors++; + kfree_skb(skb); +out: + return ret; +} +#else +static int ipvlan_process_v6_forward(struct sk_buff *skb) +{ + return NET_XMIT_DROP; +} +#endif + +static int ipvlan_process_forward(struct sk_buff *skb) +{ + struct ethhdr *ethh = eth_hdr(skb); + int ret = NET_XMIT_DROP; + + /* In this mode we dont care about multicast and broadcast traffic */ + if (is_multicast_ether_addr(ethh->h_dest)) { + pr_debug_ratelimited("Dropped {multi|broad}cast of type=[%x]\n", + ntohs(skb->protocol)); + kfree_skb(skb); + goto out; + } + + /* The ipvlan is a pseudo-L2 device, so the packets that we receive + * will have L2; which need to discarded and processed further + * in the net-ns of the main-device. + */ + if (skb_mac_header_was_set(skb)) { + skb_pull(skb, sizeof(*ethh)); + skb->mac_header = (typeof(skb->mac_header))~0U; + skb_reset_network_header(skb); + } + + if (skb->protocol == htons(ETH_P_IPV6)) { + ret = ipvlan_process_v6_forward(skb); + } else if (skb->protocol == htons(ETH_P_IP)) { + ret = ipvlan_process_v4_forward(skb); + } else { + pr_warn_ratelimited("Dropped outbound packet type=%x\n", + ntohs(skb->protocol)); + kfree_skb(skb); + } +out: + return ret; +} + static void ipvlan_multicast_enqueue(struct ipvl_port *port, struct sk_buff *skb, bool tx_pkt) { @@ -629,6 +745,46 @@ static int ipvlan_xmit_mode_l2(struct sk_buff *skb, struct net_device *dev) return dev_queue_xmit(skb); }
+static int ipvlan_xmit_mode_l2e(struct sk_buff *skb, struct net_device *dev) +{ + const struct ipvl_dev *ipvlan = netdev_priv(dev); + struct ethhdr *eth = eth_hdr(skb); + struct ipvl_addr *addr; + void *lyr3h; + int addr_type; + + if (!ipvlan_is_vepa(ipvlan->port) && + ether_addr_equal(eth->h_dest, eth->h_source)) { + lyr3h = ipvlan_get_L3_hdr(ipvlan->port, skb, &addr_type); + if (lyr3h) { + addr = ipvlan_addr_lookup(ipvlan->port, lyr3h, + addr_type, true); + if (addr) { + if (ipvlan_is_private(ipvlan->port)) { + consume_skb(skb); + return NET_XMIT_DROP; + } + return ipvlan_rcv_frame(addr, &skb, true); + } + } + skb = skb_share_check(skb, GFP_ATOMIC); + if (!skb) + return NET_XMIT_DROP; + + /* maybe the packet need been forward */ + skb->dev = ipvlan->phy_dev; + ipvlan_skb_crossing_ns(skb, ipvlan->phy_dev); + return ipvlan_process_forward(skb); + } else if (is_multicast_ether_addr(eth->h_dest)) { + ipvlan_skb_crossing_ns(skb, NULL); + ipvlan_multicast_enqueue(ipvlan->port, skb, true); + return NET_XMIT_SUCCESS; + } + + skb->dev = ipvlan->phy_dev; + return dev_queue_xmit(skb); +} + int ipvlan_queue_xmit(struct sk_buff *skb, struct net_device *dev) { struct ipvl_dev *ipvlan = netdev_priv(dev); @@ -648,6 +804,8 @@ int ipvlan_queue_xmit(struct sk_buff *skb, struct net_device *dev) case IPVLAN_MODE_L3S: #endif return ipvlan_xmit_mode_l3(skb, dev); + case IPVLAN_MODE_L2E: + return ipvlan_xmit_mode_l2e(skb, dev); }
/* Should not reach here */ @@ -729,6 +887,36 @@ static rx_handler_result_t ipvlan_handle_mode_l2(struct sk_buff **pskb, return ret; }
+static rx_handler_result_t ipvlan_handle_mode_l2e(struct sk_buff **pskb, + struct ipvl_port *port) +{ + struct sk_buff *skb = *pskb; + struct ethhdr *eth = eth_hdr(skb); + rx_handler_result_t ret = RX_HANDLER_PASS; + + if (is_multicast_ether_addr(eth->h_dest)) { + if (ipvlan_external_frame(skb, port)) { + struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC); + + /* External frames are queued for device local + * distribution, but a copy is given to master + * straight away to avoid sending duplicates later + * when work-queue processes this frame. This is + * achieved by returning RX_HANDLER_PASS. + */ + if (nskb) { + ipvlan_skb_crossing_ns(nskb, NULL); + ipvlan_multicast_enqueue(port, nskb, false); + } + } + } else { + /* Perform like l3 mode for non-multicast packet */ + ret = ipvlan_handle_mode_l3(pskb, port); + } + + return ret; +} + rx_handler_result_t ipvlan_handle_frame(struct sk_buff **pskb) { struct sk_buff *skb = *pskb; @@ -746,6 +934,8 @@ rx_handler_result_t ipvlan_handle_frame(struct sk_buff **pskb) case IPVLAN_MODE_L3S: return RX_HANDLER_PASS; #endif + case IPVLAN_MODE_L2E: + return ipvlan_handle_mode_l2e(pskb, port); }
/* Should not reach here */ diff --git a/drivers/net/ipvlan/ipvlan_main.c b/drivers/net/ipvlan/ipvlan_main.c index 60b7d93bb834..efd1452cd929 100644 --- a/drivers/net/ipvlan/ipvlan_main.c +++ b/drivers/net/ipvlan/ipvlan_main.c @@ -4,6 +4,10 @@
#include "ipvlan.h"
+static int ipvlan_default_mode = IPVLAN_MODE_L3; +module_param(ipvlan_default_mode, int, 0400); +MODULE_PARM_DESC(ipvlan_default_mode, "set ipvlan default mode: 0 for l2, 1 for l3, 2 for l3s, 3 for l2e, others invalid now"); + static int ipvlan_set_port_mode(struct ipvl_port *port, u16 nval, struct netlink_ext_ack *extack) { @@ -535,7 +539,7 @@ int ipvlan_link_new(struct net *src_net, struct net_device *dev, struct ipvl_port *port; struct net_device *phy_dev; int err; - u16 mode = IPVLAN_MODE_L3; + u16 mode = ipvlan_default_mode;
if (!tb[IFLA_LINK]) return -EINVAL; diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index c4b23f06f69e..50d4705e1cbc 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -690,6 +690,7 @@ enum ipvlan_mode { IPVLAN_MODE_L2 = 0, IPVLAN_MODE_L3, IPVLAN_MODE_L3S, + IPVLAN_MODE_L2E, IPVLAN_MODE_MAX };
From: Keefe LIU liuqifa@huawei.com
hulk inclusion category: bugfix bugzilla: 9511, https://gitee.com/openeuler/kernel/issues/I4IHL1 CVE: NA
-------------------------------------------------
In commit <7a0f243de9e2> "ipvlan: Introduce l2e mode", we introduced ipvlan_default_mode as a way to enable the ipvlan's default mode. Howerver, we didn't check the value of this module parameter.
This patch first fixed out a spelling error of "mode", and then add the value check for ipvlan_default_mode.
Signed-off-by: Keefe LIU liuqifa@huawei.com Reviewed-by: Wei Yongjun weiyongjun1@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com Signed-off-by: Lu Wei luwei32@huawei.com Reviewed-by: Yue Haibing yuehaibing@huawei.com Reviewed-by: Wei Yongjun weiyongjun1@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- drivers/net/ipvlan/ipvlan_main.c | 4 ++++ 1 file changed, 4 insertions(+)
diff --git a/drivers/net/ipvlan/ipvlan_main.c b/drivers/net/ipvlan/ipvlan_main.c index efd1452cd929..bf750140f45a 100644 --- a/drivers/net/ipvlan/ipvlan_main.c +++ b/drivers/net/ipvlan/ipvlan_main.c @@ -1026,6 +1026,10 @@ static int __init ipvlan_init_module(void) { int err;
+ if (ipvlan_default_mode >= IPVLAN_MODE_MAX || + ipvlan_default_mode < IPVLAN_MODE_L2) + return -EINVAL; + ipvlan_init_secret(); register_netdevice_notifier(&ipvlan_notifier_block); #if IS_ENABLED(CONFIG_IPV6)
From: Keefe LIU liuqifa@huawei.com
hulk inclusion category: feature bugzilla: 9511, https://gitee.com/openeuler/kernel/issues/I4IHL1 CVE: NA
-------------------------------------------------
Consider two IPVlan devices are set up on the same master, when they communicate with each other by TCP, the receive part is too fast to make the send packets coalesced, so in this case, the performace is not as good as we expect.
This patch introduces a local xmit queue for l2e mode, when the packets are sent to the IPVlan devices of the same master, the packets will be cloned and added to the local xmit queue, this operation can make the send packets coalesced and improve the TCP performace in this case.
Signed-off-by: Keefe LIU liuqifa@huawei.com Reviewed-by: Wei Yongjun weiyongjun1@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com Signed-off-by: Lu Wei luwei32@huawei.com Reviewed-by: Yue Haibing yuehaibing@huawei.com Reviewed-by: Wei Yongjun weiyongjun1@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- drivers/net/ipvlan/ipvlan.h | 7 ++++ drivers/net/ipvlan/ipvlan_core.c | 34 ++++++++++++++++- drivers/net/ipvlan/ipvlan_main.c | 65 ++++++++++++++++++++++++++++++++ 3 files changed, 105 insertions(+), 1 deletion(-)
diff --git a/drivers/net/ipvlan/ipvlan.h b/drivers/net/ipvlan/ipvlan.h index 3837c897832e..6796e742c470 100644 --- a/drivers/net/ipvlan/ipvlan.h +++ b/drivers/net/ipvlan/ipvlan.h @@ -39,6 +39,9 @@
#define IPVLAN_QBACKLOG_LIMIT 1000
+extern int sysctl_ipvlan_loop_qlen; +extern int sysctl_ipvlan_loop_delay; + typedef enum { IPVL_IPV6 = 0, IPVL_ICMPV6, @@ -70,6 +73,10 @@ struct ipvl_dev { netdev_features_t sfeatures; u32 msg_enable; spinlock_t addrs_lock; + int local_packets_cached; + unsigned long local_timeout; + struct timer_list local_free_timer; + struct sk_buff_head local_xmit_queue; };
struct ipvl_addr { diff --git a/drivers/net/ipvlan/ipvlan_core.c b/drivers/net/ipvlan/ipvlan_core.c index 5b695ec5c650..8de1f58d2fab 100644 --- a/drivers/net/ipvlan/ipvlan_core.c +++ b/drivers/net/ipvlan/ipvlan_core.c @@ -745,9 +745,37 @@ static int ipvlan_xmit_mode_l2(struct sk_buff *skb, struct net_device *dev) return dev_queue_xmit(skb); }
+static int ipvlan_l2e_local_xmit_event(struct ipvl_dev *ipvlan, + struct sk_buff **pskb) +{ + struct sk_buff *nskb, *tskb; + + while ((ipvlan->local_packets_cached >= sysctl_ipvlan_loop_qlen) && + (tskb = skb_dequeue(&ipvlan->local_xmit_queue))) { + ipvlan->local_packets_cached -= tskb->truesize; + if (ipvlan->local_packets_cached < 0 || + skb_queue_empty(&ipvlan->local_xmit_queue)) + ipvlan->local_packets_cached = 0; + kfree_skb(tskb); + } + + nskb = skb_clone(*pskb, GFP_ATOMIC); + if (!nskb) + return NET_XMIT_DROP; + + ipvlan->local_timeout = jiffies + + (sysctl_ipvlan_loop_delay * HZ) / 1000; + mod_timer(&ipvlan->local_free_timer, ipvlan->local_timeout); + skb_queue_tail(&ipvlan->local_xmit_queue, *pskb); + ipvlan->local_packets_cached += (*pskb)->truesize; + *pskb = nskb; + + return 0; +} + static int ipvlan_xmit_mode_l2e(struct sk_buff *skb, struct net_device *dev) { - const struct ipvl_dev *ipvlan = netdev_priv(dev); + struct ipvl_dev *ipvlan = netdev_priv(dev); struct ethhdr *eth = eth_hdr(skb); struct ipvl_addr *addr; void *lyr3h; @@ -764,6 +792,10 @@ static int ipvlan_xmit_mode_l2e(struct sk_buff *skb, struct net_device *dev) consume_skb(skb); return NET_XMIT_DROP; } + + if (unlikely(ipvlan_l2e_local_xmit_event(ipvlan, + &skb))) + return NET_XMIT_DROP; return ipvlan_rcv_frame(addr, &skb, true); } } diff --git a/drivers/net/ipvlan/ipvlan_main.c b/drivers/net/ipvlan/ipvlan_main.c index bf750140f45a..de256360b1ac 100644 --- a/drivers/net/ipvlan/ipvlan_main.c +++ b/drivers/net/ipvlan/ipvlan_main.c @@ -4,10 +4,43 @@
#include "ipvlan.h"
+int sysctl_ipvlan_loop_qlen = 131072; +int sysctl_ipvlan_loop_delay = 10; static int ipvlan_default_mode = IPVLAN_MODE_L3; module_param(ipvlan_default_mode, int, 0400); MODULE_PARM_DESC(ipvlan_default_mode, "set ipvlan default mode: 0 for l2, 1 for l3, 2 for l3s, 3 for l2e, others invalid now");
+static struct ctl_table_header *ipvlan_table_hrd; +static struct ctl_table ipvlan_table[] = { + { + .procname = "loop_delay", + .data = &sysctl_ipvlan_loop_delay, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "loop_qlen", + .data = &sysctl_ipvlan_loop_qlen, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { } +}; + +static int ipvlan_sysctl_init(void) +{ + ipvlan_table_hrd = register_net_sysctl(&init_net, + "net/ipvlan", ipvlan_table); + return !ipvlan_table_hrd ? -ENOMEM : 0; +} + +static void ipvlan_sysctl_exit(void) +{ + unregister_net_sysctl_table(ipvlan_table_hrd); +} + static int ipvlan_set_port_mode(struct ipvl_port *port, u16 nval, struct netlink_ext_ack *extack) { @@ -164,6 +197,32 @@ static int ipvlan_init(struct net_device *dev) return 0; }
+static void ipvlan_local_free_handler(struct timer_list *t) +{ + struct ipvl_dev *ipvlan = from_timer(ipvlan, t, local_free_timer); + + skb_queue_purge(&ipvlan->local_xmit_queue); + ipvlan->local_packets_cached = 0; +} + +static inline void ipvlan_local_init(struct net_device *dev) +{ + struct ipvl_dev *ipvlan = netdev_priv(dev); + + ipvlan->local_packets_cached = 0; + skb_queue_head_init(&ipvlan->local_xmit_queue); + timer_setup(&ipvlan->local_free_timer, + ipvlan_local_free_handler, 0); +} + +static inline void ipvlan_local_uninit(struct net_device *dev) +{ + struct ipvl_dev *ipvlan = netdev_priv(dev); + + del_timer(&ipvlan->local_free_timer); + skb_queue_purge(&ipvlan->local_xmit_queue); +} + static void ipvlan_uninit(struct net_device *dev) { struct ipvl_dev *ipvlan = netdev_priv(dev); @@ -189,6 +248,7 @@ static int ipvlan_open(struct net_device *dev) else dev->flags &= ~IFF_NOARP;
+ ipvlan_local_init(dev); rcu_read_lock(); list_for_each_entry_rcu(addr, &ipvlan->addrs, anode) ipvlan_ht_addr_add(ipvlan, addr); @@ -206,6 +266,7 @@ static int ipvlan_stop(struct net_device *dev) dev_uc_unsync(phy_dev, dev); dev_mc_unsync(phy_dev, dev);
+ ipvlan_local_uninit(dev); rcu_read_lock(); list_for_each_entry_rcu(addr, &ipvlan->addrs, anode) ipvlan_ht_addr_del(addr); @@ -1050,6 +1111,9 @@ static int __init ipvlan_init_module(void) goto error; }
+ err = ipvlan_sysctl_init(); + if (err < 0) + pr_err("ipvlan proc init failed, continue\n"); return 0; error: unregister_inetaddr_notifier(&ipvlan_addr4_notifier_block); @@ -1077,6 +1141,7 @@ static void __exit ipvlan_cleanup_module(void) unregister_inet6addr_validator_notifier( &ipvlan_addr6_vtor_notifier_block); #endif + ipvlan_sysctl_exit(); }
module_init(ipvlan_init_module);
From: Keefe Liu liuqifa@huawei.com
hulk inclusion category: bugfix bugzilla: 4411, https://gitee.com/openeuler/kernel/issues/I4IHL1 CVE: NA
-------------------------------------------------
In order to avoid integer overflow, we should limit the ranges of loop_delay value.
Fixes: f4661458116b ("ipvlan: Introduce local xmit queue for l2e mode") Signed-off-by: Keefe Liu liuqifa@huawei.com Reviewed-by: Wei Yongjun weiyongjun1@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com Signed-off-by: Lu Wei luwei32@huawei.com Reviewed-by: Yue Haibing yuehaibing@huawei.com Reviewed-by: Wei Yongjun weiyongjun1@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- drivers/net/ipvlan/ipvlan_main.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-)
diff --git a/drivers/net/ipvlan/ipvlan_main.c b/drivers/net/ipvlan/ipvlan_main.c index de256360b1ac..03812c95356b 100644 --- a/drivers/net/ipvlan/ipvlan_main.c +++ b/drivers/net/ipvlan/ipvlan_main.c @@ -4,6 +4,9 @@
#include "ipvlan.h"
+static int one = 1; +static int delay_max = 100; + int sysctl_ipvlan_loop_qlen = 131072; int sysctl_ipvlan_loop_delay = 10; static int ipvlan_default_mode = IPVLAN_MODE_L3; @@ -17,7 +20,9 @@ static struct ctl_table ipvlan_table[] = { .data = &sysctl_ipvlan_loop_delay, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = proc_dointvec_minmax, + .extra1 = &one, + .extra2 = &delay_max, }, { .procname = "loop_qlen",
From: Keefe LIU liuqifa@huawei.com
hulk inclusion category: bugfix bugzilla: 4411, https://gitee.com/openeuler/kernel/issues/I4IHL1 CVE: NA
-------------------------------------------------
In order to avoid integer overflow, we should limit the ranges of loop_qlen value.
Fixes: 997518dea253 ("ipvlan: Introduce local xmit queue for l2e mode") Signed-off-by: Keefe Liu liuqifa@huawei.com Reviewed-by: Yang Yingliang yangyingliang@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com Signed-off-by: Lu Wei luwei32@huawei.com Reviewed-by: Yue Haibing yuehaibing@huawei.com Reviewed-by: Wei Yongjun weiyongjun1@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- drivers/net/ipvlan/ipvlan_main.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-)
diff --git a/drivers/net/ipvlan/ipvlan_main.c b/drivers/net/ipvlan/ipvlan_main.c index 03812c95356b..d6edc6cf057c 100644 --- a/drivers/net/ipvlan/ipvlan_main.c +++ b/drivers/net/ipvlan/ipvlan_main.c @@ -6,6 +6,9 @@
static int one = 1; static int delay_max = 100; +/* set loop queue length from 0 to 10 big packets(65536) */ +static int qlen_min; +static int qlen_max = 655360;
int sysctl_ipvlan_loop_qlen = 131072; int sysctl_ipvlan_loop_delay = 10; @@ -29,7 +32,9 @@ static struct ctl_table ipvlan_table[] = { .data = &sysctl_ipvlan_loop_qlen, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = proc_dointvec_minmax, + .extra1 = &qlen_min, + .extra2 = &qlen_max, }, { } };
From: 沈子俊 shenzijun@kylinos.cn
mainline inclusion from mainline-v5.15-rc1 commit 7b3d52683b3a47c0ba1dfd6b5994a3a795b06972 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4A82T?from=project-issue CVE: NA
---------------------------------------------------------------
There are several places where the return value check of crypto_aead_setkey and crypto_aead_setauthsize were lost. It is necessary to add these checks.
At the same time, move the crypto_aead_setauthsize() call out of the loop, and only need to call it once after load transform.
Fixee: 53f52d7aecb4 ("crypto: tcrypt - Added speed tests for AEAD crypto alogrithms in tcrypt test suite") Signed-off-by: Tianjia Zhang tianjia.zhang@linux.alibaba.com Reviewed-by: Vitaly Chikunov vt@altlinux.org Signed-off-by: Herbert Xu herbert@gondor.apana.org.au Signed-off-by: 沈子俊 shenzijun@kylinos.cn Reviewed-by: weiyang wang wangweiyang2@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- crypto/tcrypt.c | 28 +++++++++++++++++++--------- 1 file changed, 19 insertions(+), 9 deletions(-)
diff --git a/crypto/tcrypt.c b/crypto/tcrypt.c index 8609174e036e..b1d26931621f 100644 --- a/crypto/tcrypt.c +++ b/crypto/tcrypt.c @@ -290,6 +290,11 @@ static void test_mb_aead_speed(const char *algo, int enc, int secs, }
ret = crypto_aead_setauthsize(tfm, authsize); + if (ret) { + pr_err("alg: aead: Failed to setauthsize for %s: %d\n", algo, + ret); + goto out_free_tfm; + }
for (i = 0; i < num_mb; ++i) if (testmgr_alloc_buf(data[i].xbuf)) { @@ -315,7 +320,7 @@ static void test_mb_aead_speed(const char *algo, int enc, int secs, for (i = 0; i < num_mb; ++i) { data[i].req = aead_request_alloc(tfm, GFP_KERNEL); if (!data[i].req) { - pr_err("alg: skcipher: Failed to allocate request for %s\n", + pr_err("alg: aead: Failed to allocate request for %s\n", algo); while (i--) aead_request_free(data[i].req); @@ -572,6 +577,13 @@ static void test_aead_speed(const char *algo, int enc, unsigned int secs, goto out_notfm; }
+ ret = crypto_aead_setauthsize(tfm, authsize); + if (ret) { + pr_err("alg: aead: Failed to setauthsize for %s: %d\n", algo, + ret); + goto out_noreq; + } + crypto_init_wait(&wait); printk(KERN_INFO "\ntesting speed of %s (%s) %s\n", algo, get_driver_name(crypto_aead, tfm), e); @@ -607,8 +619,13 @@ static void test_aead_speed(const char *algo, int enc, unsigned int secs, break; } } + ret = crypto_aead_setkey(tfm, key, *keysize); - ret = crypto_aead_setauthsize(tfm, authsize); + if (ret) { + pr_err("setkey() failed flags=%x\n", + crypto_aead_get_flags(tfm)); + goto out; + }
iv_len = crypto_aead_ivsize(tfm); if (iv_len) @@ -618,15 +635,8 @@ static void test_aead_speed(const char *algo, int enc, unsigned int secs, printk(KERN_INFO "test %u (%d bit key, %d byte blocks): ", i, *keysize * 8, *b_size);
- memset(tvmem[0], 0xff, PAGE_SIZE);
- if (ret) { - pr_err("setkey() failed flags=%x\n", - crypto_aead_get_flags(tfm)); - goto out; - } - sg_init_aead(sg, xbuf, *b_size + (enc ? 0 : authsize), assoc, aad_size);
From: 沈子俊 shenzijun@kylinos.cn
mainline inclusion from mainline-v5.15-rc1 commit 68039d605f7bb34ea6dbd4e099bf98599d52b0ac category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4A82T?from=project-issue CVE: NA
---------------------------------------------------------------
The GCM/CCM mode of the SM4 algorithm is defined in the rfc 8998 specification, and the test case data also comes from rfc 8998.
Signed-off-by: Tianjia Zhang tianjia.zhang@linux.alibaba.com Signed-off-by: Herbert Xu herbert@gondor.apana.org.au Signed-off-by: 沈子俊 shenzijun@kylinos.cn Reviewed-by: weiyang wang wangweiyang2@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- crypto/testmgr.c | 29 ++++++++++ crypto/testmgr.h | 148 +++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 177 insertions(+)
diff --git a/crypto/testmgr.c b/crypto/testmgr.c index 196bdf0b0dc6..c052539be5d6 100644 --- a/crypto/testmgr.c +++ b/crypto/testmgr.c @@ -4465,6 +4465,12 @@ static const struct alg_test_desc alg_test_descs[] = { .suite = { .hash = __VECS(aes_cbcmac_tv_template) } + }, { + .alg = "cbcmac(sm4)", + .test = alg_test_hash, + .suite = { + .hash = __VECS(sm4_cbcmac_tv_template) + } }, { .alg = "ccm(aes)", .generic_driver = "ccm_base(ctr(aes-generic),cbcmac(aes-generic))", @@ -4476,6 +4482,16 @@ static const struct alg_test_desc alg_test_descs[] = { .einval_allowed = 1, } } + }, { + .alg = "ccm(sm4)", + .generic_driver = "ccm_base(ctr(sm4-generic),cbcmac(sm4-generic))", + .test = alg_test_aead, + .suite = { + .aead = { + ____VECS(sm4_ccm_tv_template), + .einval_allowed = 1, + } + } }, { .alg = "cfb(aes)", .test = alg_test_skcipher, @@ -4509,6 +4525,12 @@ static const struct alg_test_desc alg_test_descs[] = { .suite = { .hash = __VECS(des3_ede_cmac64_tv_template) } + }, { + .alg = "cmac(sm4)", + .test = alg_test_hash, + .suite = { + .hash = __VECS(sm4_cmac128_tv_template) + } }, { .alg = "compress_null", .test = alg_test_null, @@ -4985,6 +5007,13 @@ static const struct alg_test_desc alg_test_descs[] = { .suite = { .aead = __VECS(aes_gcm_tv_template) } + }, { + .alg = "gcm(sm4)", + .generic_driver = "gcm_base(ctr(sm4-generic),ghash-generic)", + .test = alg_test_aead, + .suite = { + .aead = __VECS(sm4_gcm_tv_template) + } }, { .alg = "ghash", .test = alg_test_hash, diff --git a/crypto/testmgr.h b/crypto/testmgr.h index d6d0853fe74b..48dc1b6cc498 100644 --- a/crypto/testmgr.h +++ b/crypto/testmgr.h @@ -13798,6 +13798,154 @@ static const struct cipher_testvec sm4_cfb_tv_template[] = { } };
+static const struct aead_testvec sm4_gcm_tv_template[] = { + { /* From https://datatracker.ietf.org/doc/html/rfc8998#appendix-A.1 */ + .key = "\x01\x23\x45\x67\x89\xAB\xCD\xEF" + "\xFE\xDC\xBA\x98\x76\x54\x32\x10", + .klen = 16, + .iv = "\x00\x00\x12\x34\x56\x78\x00\x00" + "\x00\x00\xAB\xCD", + .ptext = "\xAA\xAA\xAA\xAA\xAA\xAA\xAA\xAA" + "\xBB\xBB\xBB\xBB\xBB\xBB\xBB\xBB" + "\xCC\xCC\xCC\xCC\xCC\xCC\xCC\xCC" + "\xDD\xDD\xDD\xDD\xDD\xDD\xDD\xDD" + "\xEE\xEE\xEE\xEE\xEE\xEE\xEE\xEE" + "\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF" + "\xEE\xEE\xEE\xEE\xEE\xEE\xEE\xEE" + "\xAA\xAA\xAA\xAA\xAA\xAA\xAA\xAA", + .plen = 64, + .assoc = "\xFE\xED\xFA\xCE\xDE\xAD\xBE\xEF" + "\xFE\xED\xFA\xCE\xDE\xAD\xBE\xEF" + "\xAB\xAD\xDA\xD2", + .alen = 20, + .ctext = "\x17\xF3\x99\xF0\x8C\x67\xD5\xEE" + "\x19\xD0\xDC\x99\x69\xC4\xBB\x7D" + "\x5F\xD4\x6F\xD3\x75\x64\x89\x06" + "\x91\x57\xB2\x82\xBB\x20\x07\x35" + "\xD8\x27\x10\xCA\x5C\x22\xF0\xCC" + "\xFA\x7C\xBF\x93\xD4\x96\xAC\x15" + "\xA5\x68\x34\xCB\xCF\x98\xC3\x97" + "\xB4\x02\x4A\x26\x91\x23\x3B\x8D" + "\x83\xDE\x35\x41\xE4\xC2\xB5\x81" + "\x77\xE0\x65\xA9\xBF\x7B\x62\xEC", + .clen = 80, + } +}; + +static const struct aead_testvec sm4_ccm_tv_template[] = { + { /* From https://datatracker.ietf.org/doc/html/rfc8998#appendix-A.2 */ + .key = "\x01\x23\x45\x67\x89\xAB\xCD\xEF" + "\xFE\xDC\xBA\x98\x76\x54\x32\x10", + .klen = 16, + .iv = "\x02\x00\x00\x12\x34\x56\x78\x00" + "\x00\x00\x00\xAB\xCD\x00\x00\x00", + .ptext = "\xAA\xAA\xAA\xAA\xAA\xAA\xAA\xAA" + "\xBB\xBB\xBB\xBB\xBB\xBB\xBB\xBB" + "\xCC\xCC\xCC\xCC\xCC\xCC\xCC\xCC" + "\xDD\xDD\xDD\xDD\xDD\xDD\xDD\xDD" + "\xEE\xEE\xEE\xEE\xEE\xEE\xEE\xEE" + "\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF" + "\xEE\xEE\xEE\xEE\xEE\xEE\xEE\xEE" + "\xAA\xAA\xAA\xAA\xAA\xAA\xAA\xAA", + .plen = 64, + .assoc = "\xFE\xED\xFA\xCE\xDE\xAD\xBE\xEF" + "\xFE\xED\xFA\xCE\xDE\xAD\xBE\xEF" + "\xAB\xAD\xDA\xD2", + .alen = 20, + .ctext = "\x48\xAF\x93\x50\x1F\xA6\x2A\xDB" + "\xCD\x41\x4C\xCE\x60\x34\xD8\x95" + "\xDD\xA1\xBF\x8F\x13\x2F\x04\x20" + "\x98\x66\x15\x72\xE7\x48\x30\x94" + "\xFD\x12\xE5\x18\xCE\x06\x2C\x98" + "\xAC\xEE\x28\xD9\x5D\xF4\x41\x6B" + "\xED\x31\xA2\xF0\x44\x76\xC1\x8B" + "\xB4\x0C\x84\xA7\x4B\x97\xDC\x5B" + "\x16\x84\x2D\x4F\xA1\x86\xF5\x6A" + "\xB3\x32\x56\x97\x1F\xA1\x10\xF4", + .clen = 80, + } +}; + +static const struct hash_testvec sm4_cbcmac_tv_template[] = { + { + .key = "\xff\xee\xdd\xcc\xbb\xaa\x99\x88" + "\x77\x66\x55\x44\x33\x22\x11\x00", + .plaintext = "\x01\x23\x45\x67\x89\xab\xcd\xef" + "\xfe\xdc\xba\x98\x76\x54\x32\x10", + .digest = "\x97\xb4\x75\x8f\x84\x92\x3d\x3f" + "\x86\x81\x0e\x0e\xea\x14\x6d\x73", + .psize = 16, + .ksize = 16, + }, { + .key = "\x01\x23\x45\x67\x89\xab\xcd\xef" + "\xfe\xdc\xBA\x98\x76\x54\x32\x10", + .plaintext = "\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa" + "\xbb\xbb\xbb\xbb\xbb\xbb\xbb\xbb" + "\xcc\xcc\xcc\xcc\xcc\xcc\xcc\xcc" + "\xdd\xdd\xdd\xdd\xdd\xdd\xdd\xdd" + "\xee", + .digest = "\xc7\xdb\x17\x71\xa1\x5c\x0d\x22" + "\xa3\x39\x3a\x31\x88\x91\x49\xa1", + .psize = 33, + .ksize = 16, + }, { + .key = "\x01\x23\x45\x67\x89\xab\xcd\xef" + "\xfe\xdc\xBA\x98\x76\x54\x32\x10", + .plaintext = "\xfb\xd1\xbe\x92\x7e\x50\x3f\x16" + "\xf9\xdd\xbe\x91\x73\x53\x37\x1a" + "\xfe\xdd\xba\x97\x7e\x53\x3c\x1c" + "\xfe\xd7\xbf\x9c\x75\x5f\x3e\x11" + "\xf0\xd8\xbc\x96\x73\x5c\x34\x11" + "\xf5\xdb\xb1\x99\x7a\x5a\x32\x1f" + "\xf6\xdf\xb4\x95\x7f\x5f\x3b\x17" + "\xfd\xdb\xb1\x9b\x76\x5c\x37", + .digest = "\x9b\x07\x88\x7f\xd5\x95\x23\x12" + "\x64\x0a\x66\x7f\x4e\x25\xca\xd0", + .psize = 63, + .ksize = 16, + } +}; + +static const struct hash_testvec sm4_cmac128_tv_template[] = { + { + .key = "\xff\xee\xdd\xcc\xbb\xaa\x99\x88" + "\x77\x66\x55\x44\x33\x22\x11\x00", + .plaintext = "\x01\x23\x45\x67\x89\xab\xcd\xef" + "\xfe\xdc\xba\x98\x76\x54\x32\x10", + .digest = "\x00\xd4\x63\xb4\x9a\xf3\x52\xe2" + "\x74\xa9\x00\x55\x13\x54\x2a\xd1", + .psize = 16, + .ksize = 16, + }, { + .key = "\x01\x23\x45\x67\x89\xab\xcd\xef" + "\xfe\xdc\xBA\x98\x76\x54\x32\x10", + .plaintext = "\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa" + "\xbb\xbb\xbb\xbb\xbb\xbb\xbb\xbb" + "\xcc\xcc\xcc\xcc\xcc\xcc\xcc\xcc" + "\xdd\xdd\xdd\xdd\xdd\xdd\xdd\xdd" + "\xee", + .digest = "\x8a\x8a\xe9\xc0\xc8\x97\x0e\x85" + "\x21\x57\x02\x10\x1a\xbf\x9c\xc6", + .psize = 33, + .ksize = 16, + }, { + .key = "\x01\x23\x45\x67\x89\xab\xcd\xef" + "\xfe\xdc\xBA\x98\x76\x54\x32\x10", + .plaintext = "\xfb\xd1\xbe\x92\x7e\x50\x3f\x16" + "\xf9\xdd\xbe\x91\x73\x53\x37\x1a" + "\xfe\xdd\xba\x97\x7e\x53\x3c\x1c" + "\xfe\xd7\xbf\x9c\x75\x5f\x3e\x11" + "\xf0\xd8\xbc\x96\x73\x5c\x34\x11" + "\xf5\xdb\xb1\x99\x7a\x5a\x32\x1f" + "\xf6\xdf\xb4\x95\x7f\x5f\x3b\x17" + "\xfd\xdb\xb1\x9b\x76\x5c\x37", + .digest = "\x5f\x14\xc9\xa9\x20\xb2\xb4\xf0" + "\x76\xe0\xd8\xd6\xdc\x4f\xe1\xbc", + .psize = 63, + .ksize = 16, + } +}; + /* Cast6 test vectors from RFC 2612 */ static const struct cipher_testvec cast6_tv_template[] = { {
From: 沈子俊 shenzijun@kylinos.cn
mainline inclusion from mainline-v5.15-rc1 commit 357a753f5ec7ccdec196fa825d906c3acc4bd57c category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4A82T?from=project-issue CVE: NA
---------------------------------------------------
tcrypt supports GCM/CCM mode, CMAC, CBCMAC, and speed test of SM4 algorithm.
Signed-off-by: Tianjia Zhang tianjia.zhang@linux.alibaba.com Signed-off-by: Herbert Xu herbert@gondor.apana.org.au Signed-off-by: 沈子俊 shenzijun@kylinos.cn Reviewed-by: weiyang wang wangweiyang2@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- crypto/tcrypt.c | 45 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+)
diff --git a/crypto/tcrypt.c b/crypto/tcrypt.c index b1d26931621f..5f9395a28c97 100644 --- a/crypto/tcrypt.c +++ b/crypto/tcrypt.c @@ -1929,6 +1929,14 @@ static int do_test(const char *alg, u32 type, u32 mask, int m, u32 num_mb) case 54: ret += tcrypt_test("streebog512"); break; + + case 55: + ret += tcrypt_test("gcm(sm4)"); + break; + + case 56: + ret += tcrypt_test("ccm(sm4)"); + break;
case 100: ret += tcrypt_test("hmac(md5)"); @@ -2025,6 +2033,15 @@ static int do_test(const char *alg, u32 type, u32 mask, int m, u32 num_mb) case 157: ret += tcrypt_test("authenc(hmac(sha1),ecb(cipher_null))"); break; + + case 158: + ret += tcrypt_test("cbcmac(sm4)"); + break; + + case 159: + ret += tcrypt_test("cmac(sm4)"); + break; + case 181: ret += tcrypt_test("authenc(hmac(sha1),cbc(des))"); break; @@ -2354,6 +2371,34 @@ static int do_test(const char *alg, u32 type, u32 mask, int m, u32 num_mb) NULL, 0, 16, 8, speed_template_16); break;
+ case 222: + test_aead_speed("gcm(sm4)", ENCRYPT, sec, + NULL, 0, 16, 8, speed_template_16); + test_aead_speed("gcm(sm4)", DECRYPT, sec, + NULL, 0, 16, 8, speed_template_16); + break; + + case 223: + test_aead_speed("rfc4309(ccm(sm4))", ENCRYPT, sec, + NULL, 0, 16, 16, aead_speed_template_19); + test_aead_speed("rfc4309(ccm(sm4))", DECRYPT, sec, + NULL, 0, 16, 16, aead_speed_template_19); + break; + + case 224: + test_mb_aead_speed("gcm(sm4)", ENCRYPT, sec, NULL, 0, 16, 8, + speed_template_16, num_mb); + test_mb_aead_speed("gcm(sm4)", DECRYPT, sec, NULL, 0, 16, 8, + speed_template_16, num_mb); + break; + + case 225: + test_mb_aead_speed("rfc4309(ccm(sm4))", ENCRYPT, sec, NULL, 0, + 16, 16, aead_speed_template_19, num_mb); + test_mb_aead_speed("rfc4309(ccm(sm4))", DECRYPT, sec, NULL, 0, + 16, 16, aead_speed_template_19, num_mb); + break; + case 300: if (alg) { test_hash_speed(alg, sec, generic_hash_speed_template);
From: 沈子俊 shenzijun@kylinos.cn
mainline inclusion from mainline-v5.15.3 commit 2b31277af577b1b2da62c3ad7d3315b422869102 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4A842?from=project-issue CVE: NA
------------------------------------------------------------------
Take the existing small footprint and mostly time invariant C code and turn it into a SM4 library that can be used for non-performance critical, casual use of SM4, and as a fallback for, e.g., SIMD code that needs a secondary path that can be taken in contexts where the SIMD unit is off limits.
Secondly, some codes have been optimized, such as unrolling small times loop, removing unnecessary memory shifts, exporting sbox, fk, ck arrays, and basic encryption and decryption functions.
Signed-off-by: Tianjia Zhang tianjia.zhang@linux.alibaba.com Signed-off-by: Herbert Xu herbert@gondor.apana.org.au Signed-off-by: 沈子俊 shenzijun@kylinos.cn Reviewed-by: weiyang wang wangweiyang2@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- crypto/Kconfig | 1 + crypto/sm4_generic.c | 153 +------------------------------------ include/crypto/sm4.h | 24 +++++- lib/crypto/Kconfig | 3 + lib/crypto/Makefile | 3 + lib/crypto/sm4.c | 176 +++++++++++++++++++++++++++++++++++++++++++ 6 files changed, 208 insertions(+), 152 deletions(-) create mode 100644 lib/crypto/sm4.c
diff --git a/crypto/Kconfig b/crypto/Kconfig index 1629cd737b78..b6b30ac9d75d 100644 --- a/crypto/Kconfig +++ b/crypto/Kconfig @@ -1619,6 +1619,7 @@ config CRYPTO_SERPENT_AVX2_X86_64 config CRYPTO_SM4 tristate "SM4 cipher algorithm" select CRYPTO_ALGAPI + select CRYPTO_LIB_SM4 help SM4 cipher algorithms (OSCCA GB/T 32907-2016).
diff --git a/crypto/sm4_generic.c b/crypto/sm4_generic.c index 016dbc595705..d19d01f852a9 100644 --- a/crypto/sm4_generic.c +++ b/crypto/sm4_generic.c @@ -16,139 +16,13 @@ #include <asm/byteorder.h> #include <asm/unaligned.h>
-static const u32 fk[4] = { - 0xa3b1bac6, 0x56aa3350, 0x677d9197, 0xb27022dc -}; - -static const u8 sbox[256] = { - 0xd6, 0x90, 0xe9, 0xfe, 0xcc, 0xe1, 0x3d, 0xb7, - 0x16, 0xb6, 0x14, 0xc2, 0x28, 0xfb, 0x2c, 0x05, - 0x2b, 0x67, 0x9a, 0x76, 0x2a, 0xbe, 0x04, 0xc3, - 0xaa, 0x44, 0x13, 0x26, 0x49, 0x86, 0x06, 0x99, - 0x9c, 0x42, 0x50, 0xf4, 0x91, 0xef, 0x98, 0x7a, - 0x33, 0x54, 0x0b, 0x43, 0xed, 0xcf, 0xac, 0x62, - 0xe4, 0xb3, 0x1c, 0xa9, 0xc9, 0x08, 0xe8, 0x95, - 0x80, 0xdf, 0x94, 0xfa, 0x75, 0x8f, 0x3f, 0xa6, - 0x47, 0x07, 0xa7, 0xfc, 0xf3, 0x73, 0x17, 0xba, - 0x83, 0x59, 0x3c, 0x19, 0xe6, 0x85, 0x4f, 0xa8, - 0x68, 0x6b, 0x81, 0xb2, 0x71, 0x64, 0xda, 0x8b, - 0xf8, 0xeb, 0x0f, 0x4b, 0x70, 0x56, 0x9d, 0x35, - 0x1e, 0x24, 0x0e, 0x5e, 0x63, 0x58, 0xd1, 0xa2, - 0x25, 0x22, 0x7c, 0x3b, 0x01, 0x21, 0x78, 0x87, - 0xd4, 0x00, 0x46, 0x57, 0x9f, 0xd3, 0x27, 0x52, - 0x4c, 0x36, 0x02, 0xe7, 0xa0, 0xc4, 0xc8, 0x9e, - 0xea, 0xbf, 0x8a, 0xd2, 0x40, 0xc7, 0x38, 0xb5, - 0xa3, 0xf7, 0xf2, 0xce, 0xf9, 0x61, 0x15, 0xa1, - 0xe0, 0xae, 0x5d, 0xa4, 0x9b, 0x34, 0x1a, 0x55, - 0xad, 0x93, 0x32, 0x30, 0xf5, 0x8c, 0xb1, 0xe3, - 0x1d, 0xf6, 0xe2, 0x2e, 0x82, 0x66, 0xca, 0x60, - 0xc0, 0x29, 0x23, 0xab, 0x0d, 0x53, 0x4e, 0x6f, - 0xd5, 0xdb, 0x37, 0x45, 0xde, 0xfd, 0x8e, 0x2f, - 0x03, 0xff, 0x6a, 0x72, 0x6d, 0x6c, 0x5b, 0x51, - 0x8d, 0x1b, 0xaf, 0x92, 0xbb, 0xdd, 0xbc, 0x7f, - 0x11, 0xd9, 0x5c, 0x41, 0x1f, 0x10, 0x5a, 0xd8, - 0x0a, 0xc1, 0x31, 0x88, 0xa5, 0xcd, 0x7b, 0xbd, - 0x2d, 0x74, 0xd0, 0x12, 0xb8, 0xe5, 0xb4, 0xb0, - 0x89, 0x69, 0x97, 0x4a, 0x0c, 0x96, 0x77, 0x7e, - 0x65, 0xb9, 0xf1, 0x09, 0xc5, 0x6e, 0xc6, 0x84, - 0x18, 0xf0, 0x7d, 0xec, 0x3a, 0xdc, 0x4d, 0x20, - 0x79, 0xee, 0x5f, 0x3e, 0xd7, 0xcb, 0x39, 0x48 -}; - -static const u32 ck[] = { - 0x00070e15, 0x1c232a31, 0x383f464d, 0x545b6269, - 0x70777e85, 0x8c939aa1, 0xa8afb6bd, 0xc4cbd2d9, - 0xe0e7eef5, 0xfc030a11, 0x181f262d, 0x343b4249, - 0x50575e65, 0x6c737a81, 0x888f969d, 0xa4abb2b9, - 0xc0c7ced5, 0xdce3eaf1, 0xf8ff060d, 0x141b2229, - 0x30373e45, 0x4c535a61, 0x686f767d, 0x848b9299, - 0xa0a7aeb5, 0xbcc3cad1, 0xd8dfe6ed, 0xf4fb0209, - 0x10171e25, 0x2c333a41, 0x484f565d, 0x646b7279 -}; - -static u32 sm4_t_non_lin_sub(u32 x) -{ - int i; - u8 *b = (u8 *)&x; - - for (i = 0; i < 4; ++i) - b[i] = sbox[b[i]]; - - return x; -} - -static u32 sm4_key_lin_sub(u32 x) -{ - return x ^ rol32(x, 13) ^ rol32(x, 23); - -} - -static u32 sm4_enc_lin_sub(u32 x) -{ - return x ^ rol32(x, 2) ^ rol32(x, 10) ^ rol32(x, 18) ^ rol32(x, 24); -} - -static u32 sm4_key_sub(u32 x) -{ - return sm4_key_lin_sub(sm4_t_non_lin_sub(x)); -} - -static u32 sm4_enc_sub(u32 x) -{ - return sm4_enc_lin_sub(sm4_t_non_lin_sub(x)); -} - -static u32 sm4_round(const u32 *x, const u32 rk) -{ - return x[0] ^ sm4_enc_sub(x[1] ^ x[2] ^ x[3] ^ rk); -} - - -/** - * crypto_sm4_expand_key - Expands the SM4 key as described in GB/T 32907-2016 - * @ctx: The location where the computed key will be stored. - * @in_key: The supplied key. - * @key_len: The length of the supplied key. - * - * Returns 0 on success. The function fails only if an invalid key size (or - * pointer) is supplied. - */ -int crypto_sm4_expand_key(struct crypto_sm4_ctx *ctx, const u8 *in_key, - unsigned int key_len) -{ - u32 rk[4], t; - const u32 *key = (u32 *)in_key; - int i; - - if (key_len != SM4_KEY_SIZE) - return -EINVAL; - - for (i = 0; i < 4; ++i) - rk[i] = get_unaligned_be32(&key[i]) ^ fk[i]; - - for (i = 0; i < 32; ++i) { - t = rk[0] ^ sm4_key_sub(rk[1] ^ rk[2] ^ rk[3] ^ ck[i]); - ctx->rkey_enc[i] = t; - rk[0] = rk[1]; - rk[1] = rk[2]; - rk[2] = rk[3]; - rk[3] = t; - } - - for (i = 0; i < 32; ++i) - ctx->rkey_dec[i] = ctx->rkey_enc[31 - i]; - - return 0; -} -EXPORT_SYMBOL_GPL(crypto_sm4_expand_key); - /** * crypto_sm4_set_key - Set the SM4 key. * @tfm: The %crypto_tfm that is used in the context. * @in_key: The input key. * @key_len: The size of the key. * - * This function uses crypto_sm4_expand_key() to expand the key. + * This function uses sm4_expandkey() to expand the key. * &crypto_sm4_ctx _must_ be the private data embedded in @tfm which is * retrieved with crypto_tfm_ctx(). * @@ -159,36 +33,17 @@ int crypto_sm4_set_key(struct crypto_tfm *tfm, const u8 *in_key, { struct crypto_sm4_ctx *ctx = crypto_tfm_ctx(tfm);
- return crypto_sm4_expand_key(ctx, in_key, key_len); + return sm4_expandkey(ctx, in_key, key_len); } EXPORT_SYMBOL_GPL(crypto_sm4_set_key);
-static void sm4_do_crypt(const u32 *rk, u32 *out, const u32 *in) -{ - u32 x[4], i, t; - - for (i = 0; i < 4; ++i) - x[i] = get_unaligned_be32(&in[i]); - - for (i = 0; i < 32; ++i) { - t = sm4_round(x, rk[i]); - x[0] = x[1]; - x[1] = x[2]; - x[2] = x[3]; - x[3] = t; - } - - for (i = 0; i < 4; ++i) - put_unaligned_be32(x[3 - i], &out[i]); -} - /* encrypt a block of text */
void crypto_sm4_encrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in) { const struct crypto_sm4_ctx *ctx = crypto_tfm_ctx(tfm);
- sm4_do_crypt(ctx->rkey_enc, (u32 *)out, (u32 *)in); + sm4_crypt_block(ctx->rkey_enc, out, in); } EXPORT_SYMBOL_GPL(crypto_sm4_encrypt);
@@ -198,7 +53,7 @@ void crypto_sm4_decrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in) { const struct crypto_sm4_ctx *ctx = crypto_tfm_ctx(tfm);
- sm4_do_crypt(ctx->rkey_dec, (u32 *)out, (u32 *)in); + sm4_crypt_block(ctx->rkey_dec, out, in); } EXPORT_SYMBOL_GPL(crypto_sm4_decrypt);
diff --git a/include/crypto/sm4.h b/include/crypto/sm4.h index 7afd730d16ff..06322325f862 100644 --- a/include/crypto/sm4.h +++ b/include/crypto/sm4.h @@ -3,6 +3,7 @@ /* * Common values for the SM4 algorithm * Copyright (C) 2018 ARM Limited or its affiliates. + * Copyright (c) 2021 Tianjia Zhang tianjia.zhang@linux.alibaba.com */
#ifndef _CRYPTO_SM4_H @@ -20,11 +21,28 @@ struct crypto_sm4_ctx { u32 rkey_dec[SM4_RKEY_WORDS]; };
-int crypto_sm4_set_key(struct crypto_tfm *tfm, const u8 *in_key, - unsigned int key_len); -int crypto_sm4_expand_key(struct crypto_sm4_ctx *ctx, const u8 *in_key, +/** + * sm4_expandkey - Expands the SM4 key as described in GB/T 32907-2016 + * @ctx: The location where the computed key will be stored. + * @in_key: The supplied key. + * @key_len: The length of the supplied key. + * + * Returns 0 on success. The function fails only if an invalid key size (or + * pointer) is supplied. + */ +int sm4_expandkey(struct crypto_sm4_ctx *ctx, const u8 *in_key, unsigned int key_len);
+/** + * sm4_crypt_block - Encrypt or decrypt a single SM4 block + * @rk: The rkey_enc for encrypt or rkey_dec for decrypt + * @out: Buffer to store output data + * @in: Buffer containing the input data + */ +void sm4_crypt_block(const u32 *rk, u8 *out, const u8 *in); + +int crypto_sm4_set_key(struct crypto_tfm *tfm, const u8 *in_key, + unsigned int key_len); void crypto_sm4_encrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in); void crypto_sm4_decrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in);
diff --git a/lib/crypto/Kconfig b/lib/crypto/Kconfig index 14c032de276e..545ccbddf6a1 100644 --- a/lib/crypto/Kconfig +++ b/lib/crypto/Kconfig @@ -128,3 +128,6 @@ config CRYPTO_LIB_CHACHA20POLY1305
config CRYPTO_LIB_SHA256 tristate + +config CRYPTO_LIB_SM4 + tristate diff --git a/lib/crypto/Makefile b/lib/crypto/Makefile index 3a435629d9ce..73205ed269ba 100644 --- a/lib/crypto/Makefile +++ b/lib/crypto/Makefile @@ -38,6 +38,9 @@ libpoly1305-y += poly1305.o obj-$(CONFIG_CRYPTO_LIB_SHA256) += libsha256.o libsha256-y := sha256.o
+obj-$(CONFIG_CRYPTO_LIB_SM4) += libsm4.o +libsm4-y := sm4.o + ifneq ($(CONFIG_CRYPTO_MANAGER_DISABLE_TESTS),y) libblake2s-y += blake2s-selftest.o libchacha20poly1305-y += chacha20poly1305-selftest.o diff --git a/lib/crypto/sm4.c b/lib/crypto/sm4.c new file mode 100644 index 000000000000..5fbf8c741a2f --- /dev/null +++ b/lib/crypto/sm4.c @@ -0,0 +1,176 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * SM4, as specified in + * https://tools.ietf.org/id/draft-ribose-cfrg-sm4-10.html + * + * Copyright (C) 2018 ARM Limited or its affiliates. + * Copyright (c) 2021 Tianjia Zhang tianjia.zhang@linux.alibaba.com + */ + +#include <linux/module.h> +#include <asm/unaligned.h> +#include <crypto/sm4.h> + +static const u32 fk[4] = { + 0xa3b1bac6, 0x56aa3350, 0x677d9197, 0xb27022dc +}; + +static const u32 __cacheline_aligned ck[32] = { + 0x00070e15, 0x1c232a31, 0x383f464d, 0x545b6269, + 0x70777e85, 0x8c939aa1, 0xa8afb6bd, 0xc4cbd2d9, + 0xe0e7eef5, 0xfc030a11, 0x181f262d, 0x343b4249, + 0x50575e65, 0x6c737a81, 0x888f969d, 0xa4abb2b9, + 0xc0c7ced5, 0xdce3eaf1, 0xf8ff060d, 0x141b2229, + 0x30373e45, 0x4c535a61, 0x686f767d, 0x848b9299, + 0xa0a7aeb5, 0xbcc3cad1, 0xd8dfe6ed, 0xf4fb0209, + 0x10171e25, 0x2c333a41, 0x484f565d, 0x646b7279 +}; + +static const u8 __cacheline_aligned sbox[256] = { + 0xd6, 0x90, 0xe9, 0xfe, 0xcc, 0xe1, 0x3d, 0xb7, + 0x16, 0xb6, 0x14, 0xc2, 0x28, 0xfb, 0x2c, 0x05, + 0x2b, 0x67, 0x9a, 0x76, 0x2a, 0xbe, 0x04, 0xc3, + 0xaa, 0x44, 0x13, 0x26, 0x49, 0x86, 0x06, 0x99, + 0x9c, 0x42, 0x50, 0xf4, 0x91, 0xef, 0x98, 0x7a, + 0x33, 0x54, 0x0b, 0x43, 0xed, 0xcf, 0xac, 0x62, + 0xe4, 0xb3, 0x1c, 0xa9, 0xc9, 0x08, 0xe8, 0x95, + 0x80, 0xdf, 0x94, 0xfa, 0x75, 0x8f, 0x3f, 0xa6, + 0x47, 0x07, 0xa7, 0xfc, 0xf3, 0x73, 0x17, 0xba, + 0x83, 0x59, 0x3c, 0x19, 0xe6, 0x85, 0x4f, 0xa8, + 0x68, 0x6b, 0x81, 0xb2, 0x71, 0x64, 0xda, 0x8b, + 0xf8, 0xeb, 0x0f, 0x4b, 0x70, 0x56, 0x9d, 0x35, + 0x1e, 0x24, 0x0e, 0x5e, 0x63, 0x58, 0xd1, 0xa2, + 0x25, 0x22, 0x7c, 0x3b, 0x01, 0x21, 0x78, 0x87, + 0xd4, 0x00, 0x46, 0x57, 0x9f, 0xd3, 0x27, 0x52, + 0x4c, 0x36, 0x02, 0xe7, 0xa0, 0xc4, 0xc8, 0x9e, + 0xea, 0xbf, 0x8a, 0xd2, 0x40, 0xc7, 0x38, 0xb5, + 0xa3, 0xf7, 0xf2, 0xce, 0xf9, 0x61, 0x15, 0xa1, + 0xe0, 0xae, 0x5d, 0xa4, 0x9b, 0x34, 0x1a, 0x55, + 0xad, 0x93, 0x32, 0x30, 0xf5, 0x8c, 0xb1, 0xe3, + 0x1d, 0xf6, 0xe2, 0x2e, 0x82, 0x66, 0xca, 0x60, + 0xc0, 0x29, 0x23, 0xab, 0x0d, 0x53, 0x4e, 0x6f, + 0xd5, 0xdb, 0x37, 0x45, 0xde, 0xfd, 0x8e, 0x2f, + 0x03, 0xff, 0x6a, 0x72, 0x6d, 0x6c, 0x5b, 0x51, + 0x8d, 0x1b, 0xaf, 0x92, 0xbb, 0xdd, 0xbc, 0x7f, + 0x11, 0xd9, 0x5c, 0x41, 0x1f, 0x10, 0x5a, 0xd8, + 0x0a, 0xc1, 0x31, 0x88, 0xa5, 0xcd, 0x7b, 0xbd, + 0x2d, 0x74, 0xd0, 0x12, 0xb8, 0xe5, 0xb4, 0xb0, + 0x89, 0x69, 0x97, 0x4a, 0x0c, 0x96, 0x77, 0x7e, + 0x65, 0xb9, 0xf1, 0x09, 0xc5, 0x6e, 0xc6, 0x84, + 0x18, 0xf0, 0x7d, 0xec, 0x3a, 0xdc, 0x4d, 0x20, + 0x79, 0xee, 0x5f, 0x3e, 0xd7, 0xcb, 0x39, 0x48 +}; + +static inline u32 sm4_t_non_lin_sub(u32 x) +{ + u32 out; + + out = (u32)sbox[x & 0xff]; + out |= (u32)sbox[(x >> 8) & 0xff] << 8; + out |= (u32)sbox[(x >> 16) & 0xff] << 16; + out |= (u32)sbox[(x >> 24) & 0xff] << 24; + + return out; +} + +static inline u32 sm4_key_lin_sub(u32 x) +{ + return x ^ rol32(x, 13) ^ rol32(x, 23); +} + +static inline u32 sm4_enc_lin_sub(u32 x) +{ + return x ^ rol32(x, 2) ^ rol32(x, 10) ^ rol32(x, 18) ^ rol32(x, 24); +} + +static inline u32 sm4_key_sub(u32 x) +{ + return sm4_key_lin_sub(sm4_t_non_lin_sub(x)); +} + +static inline u32 sm4_enc_sub(u32 x) +{ + return sm4_enc_lin_sub(sm4_t_non_lin_sub(x)); +} + +static inline u32 sm4_round(u32 x0, u32 x1, u32 x2, u32 x3, u32 rk) +{ + return x0 ^ sm4_enc_sub(x1 ^ x2 ^ x3 ^ rk); +} + + +/** + * sm4_expandkey - Expands the SM4 key as described in GB/T 32907-2016 + * @ctx: The location where the computed key will be stored. + * @in_key: The supplied key. + * @key_len: The length of the supplied key. + * + * Returns 0 on success. The function fails only if an invalid key size (or + * pointer) is supplied. + */ +int sm4_expandkey(struct crypto_sm4_ctx *ctx, const u8 *in_key, + unsigned int key_len) +{ + u32 rk[4]; + const u32 *key = (u32 *)in_key; + int i; + + if (key_len != SM4_KEY_SIZE) + return -EINVAL; + + rk[0] = get_unaligned_be32(&key[0]) ^ fk[0]; + rk[1] = get_unaligned_be32(&key[1]) ^ fk[1]; + rk[2] = get_unaligned_be32(&key[2]) ^ fk[2]; + rk[3] = get_unaligned_be32(&key[3]) ^ fk[3]; + + for (i = 0; i < 32; i += 4) { + rk[0] ^= sm4_key_sub(rk[1] ^ rk[2] ^ rk[3] ^ ck[i + 0]); + rk[1] ^= sm4_key_sub(rk[2] ^ rk[3] ^ rk[0] ^ ck[i + 1]); + rk[2] ^= sm4_key_sub(rk[3] ^ rk[0] ^ rk[1] ^ ck[i + 2]); + rk[3] ^= sm4_key_sub(rk[0] ^ rk[1] ^ rk[2] ^ ck[i + 3]); + + ctx->rkey_enc[i + 0] = rk[0]; + ctx->rkey_enc[i + 1] = rk[1]; + ctx->rkey_enc[i + 2] = rk[2]; + ctx->rkey_enc[i + 3] = rk[3]; + ctx->rkey_dec[31 - 0 - i] = rk[0]; + ctx->rkey_dec[31 - 1 - i] = rk[1]; + ctx->rkey_dec[31 - 2 - i] = rk[2]; + ctx->rkey_dec[31 - 3 - i] = rk[3]; + } + + return 0; +} +EXPORT_SYMBOL_GPL(sm4_expandkey); + +/** + * sm4_crypt_block - Encrypt or decrypt a single SM4 block + * @rk: The rkey_enc for encrypt or rkey_dec for decrypt + * @out: Buffer to store output data + * @in: Buffer containing the input data + */ +void sm4_crypt_block(const u32 *rk, u8 *out, const u8 *in) +{ + u32 x[4], i; + + x[0] = get_unaligned_be32(in + 0 * 4); + x[1] = get_unaligned_be32(in + 1 * 4); + x[2] = get_unaligned_be32(in + 2 * 4); + x[3] = get_unaligned_be32(in + 3 * 4); + + for (i = 0; i < 32; i += 4) { + x[0] = sm4_round(x[0], x[1], x[2], x[3], rk[i + 0]); + x[1] = sm4_round(x[1], x[2], x[3], x[0], rk[i + 1]); + x[2] = sm4_round(x[2], x[3], x[0], x[1], rk[i + 2]); + x[3] = sm4_round(x[3], x[0], x[1], x[2], rk[i + 3]); + } + + put_unaligned_be32(x[3 - 0], out + 0 * 4); + put_unaligned_be32(x[3 - 1], out + 1 * 4); + put_unaligned_be32(x[3 - 2], out + 2 * 4); + put_unaligned_be32(x[3 - 3], out + 3 * 4); +} +EXPORT_SYMBOL_GPL(sm4_crypt_block); + +MODULE_DESCRIPTION("Generic SM4 library"); +MODULE_LICENSE("GPL v2");
From: 沈子俊 shenzijun@kylinos.cn
mainline inclusion from mainline-v5.15.3 commit c59de48e125c6d49a8abd165e388ca57bfe37b17 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4A842?from=project-issue CVE: NA
--------------------------------------------------------------------------
SM4 library is abstracted from sm4-generic algorithm, sm4-ce can depend on the SM4 library instead of sm4-generic, and some functions in sm4-generic do not need to be exported.
Signed-off-by: Tianjia Zhang tianjia.zhang@linux.alibaba.com Signed-off-by: Herbert Xu herbert@gondor.apana.org.au Signed-off-by: 沈子俊 shenzijun@kylinos.cn Reviewed-by: weiyang wang wangweiyang2@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- arch/arm64/crypto/Kconfig | 2 +- arch/arm64/crypto/sm4-ce-glue.c | 20 ++++++++++++++------ crypto/sm4_generic.c | 27 ++++++++++++--------------- include/crypto/sm4.h | 9 ++------- lib/crypto/sm4.c | 2 +- 5 files changed, 30 insertions(+), 30 deletions(-)
diff --git a/arch/arm64/crypto/Kconfig b/arch/arm64/crypto/Kconfig index b8eb0453123d..55f19450091b 100644 --- a/arch/arm64/crypto/Kconfig +++ b/arch/arm64/crypto/Kconfig @@ -51,7 +51,7 @@ config CRYPTO_SM4_ARM64_CE tristate "SM4 symmetric cipher (ARMv8.2 Crypto Extensions)" depends on KERNEL_MODE_NEON select CRYPTO_ALGAPI - select CRYPTO_SM4 + select CRYPTO_LIB_SM4
config CRYPTO_GHASH_ARM64_CE tristate "GHASH/AES-GCM using ARMv8 Crypto Extensions" diff --git a/arch/arm64/crypto/sm4-ce-glue.c b/arch/arm64/crypto/sm4-ce-glue.c index 2754c875d39c..9c93cfc4841b 100644 --- a/arch/arm64/crypto/sm4-ce-glue.c +++ b/arch/arm64/crypto/sm4-ce-glue.c @@ -17,12 +17,20 @@ MODULE_LICENSE("GPL v2");
asmlinkage void sm4_ce_do_crypt(const u32 *rk, void *out, const void *in);
+static int sm4_ce_setkey(struct crypto_tfm *tfm, const u8 *key, + unsigned int key_len) +{ + struct sm4_ctx *ctx = crypto_tfm_ctx(tfm); + + return sm4_expandkey(ctx, key, key_len); +} + static void sm4_ce_encrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in) { - const struct crypto_sm4_ctx *ctx = crypto_tfm_ctx(tfm); + const struct sm4_ctx *ctx = crypto_tfm_ctx(tfm);
if (!crypto_simd_usable()) { - crypto_sm4_encrypt(tfm, out, in); + sm4_crypt_block(ctx->rkey_enc, out, in); } else { kernel_neon_begin(); sm4_ce_do_crypt(ctx->rkey_enc, out, in); @@ -32,10 +40,10 @@ static void sm4_ce_encrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in)
static void sm4_ce_decrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in) { - const struct crypto_sm4_ctx *ctx = crypto_tfm_ctx(tfm); + const struct sm4_ctx *ctx = crypto_tfm_ctx(tfm);
if (!crypto_simd_usable()) { - crypto_sm4_decrypt(tfm, out, in); + sm4_crypt_block(ctx->rkey_dec, out, in); } else { kernel_neon_begin(); sm4_ce_do_crypt(ctx->rkey_dec, out, in); @@ -49,12 +57,12 @@ static struct crypto_alg sm4_ce_alg = { .cra_priority = 200, .cra_flags = CRYPTO_ALG_TYPE_CIPHER, .cra_blocksize = SM4_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct crypto_sm4_ctx), + .cra_ctxsize = sizeof(struct sm4_ctx), .cra_module = THIS_MODULE, .cra_u.cipher = { .cia_min_keysize = SM4_KEY_SIZE, .cia_max_keysize = SM4_KEY_SIZE, - .cia_setkey = crypto_sm4_set_key, + .cia_setkey = sm4_ce_setkey, .cia_encrypt = sm4_ce_encrypt, .cia_decrypt = sm4_ce_decrypt } diff --git a/crypto/sm4_generic.c b/crypto/sm4_generic.c index d19d01f852a9..4a6480a27fee 100644 --- a/crypto/sm4_generic.c +++ b/crypto/sm4_generic.c @@ -17,45 +17,42 @@ #include <asm/unaligned.h>
/** - * crypto_sm4_set_key - Set the SM4 key. + * sm4_setkey - Set the SM4 key. * @tfm: The %crypto_tfm that is used in the context. * @in_key: The input key. * @key_len: The size of the key. * * This function uses sm4_expandkey() to expand the key. - * &crypto_sm4_ctx _must_ be the private data embedded in @tfm which is + * &sm4_ctx _must_ be the private data embedded in @tfm which is * retrieved with crypto_tfm_ctx(). * * Return: 0 on success; -EINVAL on failure (only happens for bad key lengths) */ -int crypto_sm4_set_key(struct crypto_tfm *tfm, const u8 *in_key, +static int sm4_setkey(struct crypto_tfm *tfm, const u8 *in_key, unsigned int key_len) { - struct crypto_sm4_ctx *ctx = crypto_tfm_ctx(tfm); + struct sm4_ctx *ctx = crypto_tfm_ctx(tfm);
return sm4_expandkey(ctx, in_key, key_len); } -EXPORT_SYMBOL_GPL(crypto_sm4_set_key);
/* encrypt a block of text */
-void crypto_sm4_encrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in) +static void sm4_encrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in) { - const struct crypto_sm4_ctx *ctx = crypto_tfm_ctx(tfm); + const struct sm4_ctx *ctx = crypto_tfm_ctx(tfm);
sm4_crypt_block(ctx->rkey_enc, out, in); } -EXPORT_SYMBOL_GPL(crypto_sm4_encrypt);
/* decrypt a block of text */
-void crypto_sm4_decrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in) +static void sm4_decrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in) { - const struct crypto_sm4_ctx *ctx = crypto_tfm_ctx(tfm); + const struct sm4_ctx *ctx = crypto_tfm_ctx(tfm);
sm4_crypt_block(ctx->rkey_dec, out, in); } -EXPORT_SYMBOL_GPL(crypto_sm4_decrypt);
static struct crypto_alg sm4_alg = { .cra_name = "sm4", @@ -63,15 +60,15 @@ static struct crypto_alg sm4_alg = { .cra_priority = 100, .cra_flags = CRYPTO_ALG_TYPE_CIPHER, .cra_blocksize = SM4_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct crypto_sm4_ctx), + .cra_ctxsize = sizeof(struct sm4_ctx), .cra_module = THIS_MODULE, .cra_u = { .cipher = { .cia_min_keysize = SM4_KEY_SIZE, .cia_max_keysize = SM4_KEY_SIZE, - .cia_setkey = crypto_sm4_set_key, - .cia_encrypt = crypto_sm4_encrypt, - .cia_decrypt = crypto_sm4_decrypt + .cia_setkey = sm4_setkey, + .cia_encrypt = sm4_encrypt, + .cia_decrypt = sm4_decrypt } } }; diff --git a/include/crypto/sm4.h b/include/crypto/sm4.h index 06322325f862..709f286e7b25 100644 --- a/include/crypto/sm4.h +++ b/include/crypto/sm4.h @@ -16,7 +16,7 @@ #define SM4_BLOCK_SIZE 16 #define SM4_RKEY_WORDS 32
-struct crypto_sm4_ctx { +struct sm4_ctx { u32 rkey_enc[SM4_RKEY_WORDS]; u32 rkey_dec[SM4_RKEY_WORDS]; }; @@ -30,7 +30,7 @@ struct crypto_sm4_ctx { * Returns 0 on success. The function fails only if an invalid key size (or * pointer) is supplied. */ -int sm4_expandkey(struct crypto_sm4_ctx *ctx, const u8 *in_key, +int sm4_expandkey(struct sm4_ctx *ctx, const u8 *in_key, unsigned int key_len);
/** @@ -41,9 +41,4 @@ int sm4_expandkey(struct crypto_sm4_ctx *ctx, const u8 *in_key, */ void sm4_crypt_block(const u32 *rk, u8 *out, const u8 *in);
-int crypto_sm4_set_key(struct crypto_tfm *tfm, const u8 *in_key, - unsigned int key_len); -void crypto_sm4_encrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in); -void crypto_sm4_decrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in); - #endif diff --git a/lib/crypto/sm4.c b/lib/crypto/sm4.c index 5fbf8c741a2f..633b59fed9db 100644 --- a/lib/crypto/sm4.c +++ b/lib/crypto/sm4.c @@ -108,7 +108,7 @@ static inline u32 sm4_round(u32 x0, u32 x1, u32 x2, u32 x3, u32 rk) * Returns 0 on success. The function fails only if an invalid key size (or * pointer) is supplied. */ -int sm4_expandkey(struct crypto_sm4_ctx *ctx, const u8 *in_key, +int sm4_expandkey(struct sm4_ctx *ctx, const u8 *in_key, unsigned int key_len) { u32 rk[4];
From: 沈子俊 shenzijun@kylinos.cn
mainline inclusion from mainline-v5.15.3 commit a7ee22ee1445c7fdb00ab80116bb9710ca86a860 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4A842?from=project-issue CVE: NA
--------------------------------------------------------------------
This patch adds AES-NI/AVX/x86_64 assembler implementation of SM4 block cipher. Through two affine transforms, we can use the AES S-Box to simulate the SM4 S-Box to achieve the effect of instruction acceleration.
The main algorithm implementation comes from SM4 AES-NI work by libgcrypt and Markku-Juhani O. Saarinen at: https://github.com/mjosaarinen/sm4ni
This optimization supports the four modes of SM4, ECB, CBC, CFB, and CTR. Since CBC and CFB do not support multiple block parallel encryption, the optimization effect is not obvious.
Benchmark on Intel Xeon Cascadelake, the data comes from the 218 mode and 518 mode of tcrypt. The abscissas are blocks of different lengths. The data is tabulated and the unit is Mb/s:
sm4-generic | 16 64 128 256 1024 1420 4096 ECB enc | 40.99 46.50 48.05 48.41 49.20 49.25 49.28 ECB dec | 41.07 46.99 48.15 48.67 49.20 49.25 49.29 CBC enc | 37.71 45.28 46.77 47.60 48.32 48.37 48.40 CBC dec | 36.48 44.82 46.43 47.45 48.23 48.30 48.36 CFB enc | 37.94 44.84 46.12 46.94 47.57 47.46 47.68 CFB dec | 37.50 42.84 43.74 44.37 44.85 44.80 44.96 CTR enc | 39.20 45.63 46.75 47.49 48.09 47.85 48.08 CTR dec | 39.64 45.70 46.72 47.47 47.98 47.88 48.06 sm4-aesni-avx ECB enc | 33.75 134.47 221.64 243.43 264.05 251.58 258.13 ECB dec | 34.02 134.92 223.11 245.14 264.12 251.04 258.33 CBC enc | 38.85 46.18 47.67 48.34 49.00 48.96 49.14 CBC dec | 33.54 131.29 223.88 245.27 265.50 252.41 263.78 CFB enc | 38.70 46.10 47.58 48.29 49.01 48.94 49.19 CFB dec | 32.79 128.40 223.23 244.87 265.77 253.31 262.79 CTR enc | 32.58 122.23 220.29 241.16 259.57 248.32 256.69 CTR dec | 32.81 122.47 218.99 241.54 258.42 248.58 256.61
Signed-off-by: Tianjia Zhang tianjia.zhang@linux.alibaba.com Signed-off-by: Herbert Xu herbert@gondor.apana.org.au Signed-off-by: 沈子俊 shenzijun@kylinos.cn Reviewed-by: weiyang wang wangweiyang2@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- arch/x86/crypto/Makefile | 3 + arch/x86/crypto/sm4-aesni-avx-asm_64.S | 589 +++++++++++++++++++++++++ arch/x86/crypto/sm4_aesni_avx_glue.c | 459 +++++++++++++++++++ crypto/Kconfig | 21 + 4 files changed, 1072 insertions(+) create mode 100644 arch/x86/crypto/sm4-aesni-avx-asm_64.S create mode 100644 arch/x86/crypto/sm4_aesni_avx_glue.c
diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile index a31de0c6ccde..5535810beef5 100644 --- a/arch/x86/crypto/Makefile +++ b/arch/x86/crypto/Makefile @@ -92,6 +92,9 @@ nhpoly1305-avx2-y := nh-avx2-x86_64.o nhpoly1305-avx2-glue.o
obj-$(CONFIG_CRYPTO_CURVE25519_X86) += curve25519-x86_64.o
+obj-$(CONFIG_CRYPTO_SM4_AESNI_AVX_X86_64) += sm4-aesni-avx-x86_64.o +sm4-aesni-avx-x86_64-y := sm4-aesni-avx-asm_64.o sm4_aesni_avx_glue.o + quiet_cmd_perlasm = PERLASM $@ cmd_perlasm = $(PERL) $< > $@ $(obj)/%.S: $(src)/%.pl FORCE diff --git a/arch/x86/crypto/sm4-aesni-avx-asm_64.S b/arch/x86/crypto/sm4-aesni-avx-asm_64.S new file mode 100644 index 000000000000..fa2c3f50aecb --- /dev/null +++ b/arch/x86/crypto/sm4-aesni-avx-asm_64.S @@ -0,0 +1,589 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * SM4 Cipher Algorithm, AES-NI/AVX optimized. + * as specified in + * https://tools.ietf.org/id/draft-ribose-cfrg-sm4-10.html + * + * Copyright (C) 2018 Markku-Juhani O. Saarinen mjos@iki.fi + * Copyright (C) 2020 Jussi Kivilinna jussi.kivilinna@iki.fi + * Copyright (c) 2021 Tianjia Zhang tianjia.zhang@linux.alibaba.com + */ + +/* Based on SM4 AES-NI work by libgcrypt and Markku-Juhani O. Saarinen at: + * https://github.com/mjosaarinen/sm4ni + */ + +#include <linux/linkage.h> +#include <asm/frame.h> + +#define rRIP (%rip) + +#define RX0 %xmm0 +#define RX1 %xmm1 +#define MASK_4BIT %xmm2 +#define RTMP0 %xmm3 +#define RTMP1 %xmm4 +#define RTMP2 %xmm5 +#define RTMP3 %xmm6 +#define RTMP4 %xmm7 + +#define RA0 %xmm8 +#define RA1 %xmm9 +#define RA2 %xmm10 +#define RA3 %xmm11 + +#define RB0 %xmm12 +#define RB1 %xmm13 +#define RB2 %xmm14 +#define RB3 %xmm15 + +#define RNOT %xmm0 +#define RBSWAP %xmm1 + + +/* Transpose four 32-bit words between 128-bit vectors. */ +#define transpose_4x4(x0, x1, x2, x3, t1, t2) \ + vpunpckhdq x1, x0, t2; \ + vpunpckldq x1, x0, x0; \ + \ + vpunpckldq x3, x2, t1; \ + vpunpckhdq x3, x2, x2; \ + \ + vpunpckhqdq t1, x0, x1; \ + vpunpcklqdq t1, x0, x0; \ + \ + vpunpckhqdq x2, t2, x3; \ + vpunpcklqdq x2, t2, x2; + +/* pre-SubByte transform. */ +#define transform_pre(x, lo_t, hi_t, mask4bit, tmp0) \ + vpand x, mask4bit, tmp0; \ + vpandn x, mask4bit, x; \ + vpsrld $4, x, x; \ + \ + vpshufb tmp0, lo_t, tmp0; \ + vpshufb x, hi_t, x; \ + vpxor tmp0, x, x; + +/* post-SubByte transform. Note: x has been XOR'ed with mask4bit by + * 'vaeslastenc' instruction. + */ +#define transform_post(x, lo_t, hi_t, mask4bit, tmp0) \ + vpandn mask4bit, x, tmp0; \ + vpsrld $4, x, x; \ + vpand x, mask4bit, x; \ + \ + vpshufb tmp0, lo_t, tmp0; \ + vpshufb x, hi_t, x; \ + vpxor tmp0, x, x; + + +.section .rodata.cst164, "aM", @progbits, 164 +.align 16 + +/* + * Following four affine transform look-up tables are from work by + * Markku-Juhani O. Saarinen, at https://github.com/mjosaarinen/sm4ni + * + * These allow exposing SM4 S-Box from AES SubByte. + */ + +/* pre-SubByte affine transform, from SM4 field to AES field. */ +.Lpre_tf_lo_s: + .quad 0x9197E2E474720701, 0xC7C1B4B222245157 +.Lpre_tf_hi_s: + .quad 0xE240AB09EB49A200, 0xF052B91BF95BB012 + +/* post-SubByte affine transform, from AES field to SM4 field. */ +.Lpost_tf_lo_s: + .quad 0x5B67F2CEA19D0834, 0xEDD14478172BBE82 +.Lpost_tf_hi_s: + .quad 0xAE7201DD73AFDC00, 0x11CDBE62CC1063BF + +/* For isolating SubBytes from AESENCLAST, inverse shift row */ +.Linv_shift_row: + .byte 0x00, 0x0d, 0x0a, 0x07, 0x04, 0x01, 0x0e, 0x0b + .byte 0x08, 0x05, 0x02, 0x0f, 0x0c, 0x09, 0x06, 0x03 + +/* Inverse shift row + Rotate left by 8 bits on 32-bit words with vpshufb */ +.Linv_shift_row_rol_8: + .byte 0x07, 0x00, 0x0d, 0x0a, 0x0b, 0x04, 0x01, 0x0e + .byte 0x0f, 0x08, 0x05, 0x02, 0x03, 0x0c, 0x09, 0x06 + +/* Inverse shift row + Rotate left by 16 bits on 32-bit words with vpshufb */ +.Linv_shift_row_rol_16: + .byte 0x0a, 0x07, 0x00, 0x0d, 0x0e, 0x0b, 0x04, 0x01 + .byte 0x02, 0x0f, 0x08, 0x05, 0x06, 0x03, 0x0c, 0x09 + +/* Inverse shift row + Rotate left by 24 bits on 32-bit words with vpshufb */ +.Linv_shift_row_rol_24: + .byte 0x0d, 0x0a, 0x07, 0x00, 0x01, 0x0e, 0x0b, 0x04 + .byte 0x05, 0x02, 0x0f, 0x08, 0x09, 0x06, 0x03, 0x0c + +/* For CTR-mode IV byteswap */ +.Lbswap128_mask: + .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 + +/* For input word byte-swap */ +.Lbswap32_mask: + .byte 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 + +.align 4 +/* 4-bit mask */ +.L0f0f0f0f: + .long 0x0f0f0f0f + + +.text +.align 16 + +/* + * void sm4_aesni_avx_crypt4(const u32 *rk, u8 *dst, + * const u8 *src, int nblocks) + */ +.align 8 +SYM_FUNC_START(sm4_aesni_avx_crypt4) + /* input: + * %rdi: round key array, CTX + * %rsi: dst (1..4 blocks) + * %rdx: src (1..4 blocks) + * %rcx: num blocks (1..4) + */ + FRAME_BEGIN + + vmovdqu 0*16(%rdx), RA0; + vmovdqa RA0, RA1; + vmovdqa RA0, RA2; + vmovdqa RA0, RA3; + cmpq $2, %rcx; + jb .Lblk4_load_input_done; + vmovdqu 1*16(%rdx), RA1; + je .Lblk4_load_input_done; + vmovdqu 2*16(%rdx), RA2; + cmpq $3, %rcx; + je .Lblk4_load_input_done; + vmovdqu 3*16(%rdx), RA3; + +.Lblk4_load_input_done: + + vmovdqa .Lbswap32_mask rRIP, RTMP2; + vpshufb RTMP2, RA0, RA0; + vpshufb RTMP2, RA1, RA1; + vpshufb RTMP2, RA2, RA2; + vpshufb RTMP2, RA3, RA3; + + vbroadcastss .L0f0f0f0f rRIP, MASK_4BIT; + vmovdqa .Lpre_tf_lo_s rRIP, RTMP4; + vmovdqa .Lpre_tf_hi_s rRIP, RB0; + vmovdqa .Lpost_tf_lo_s rRIP, RB1; + vmovdqa .Lpost_tf_hi_s rRIP, RB2; + vmovdqa .Linv_shift_row rRIP, RB3; + vmovdqa .Linv_shift_row_rol_8 rRIP, RTMP2; + vmovdqa .Linv_shift_row_rol_16 rRIP, RTMP3; + transpose_4x4(RA0, RA1, RA2, RA3, RTMP0, RTMP1); + +#define ROUND(round, s0, s1, s2, s3) \ + vbroadcastss (4*(round))(%rdi), RX0; \ + vpxor s1, RX0, RX0; \ + vpxor s2, RX0, RX0; \ + vpxor s3, RX0, RX0; /* s1 ^ s2 ^ s3 ^ rk */ \ + \ + /* sbox, non-linear part */ \ + transform_pre(RX0, RTMP4, RB0, MASK_4BIT, RTMP0); \ + vaesenclast MASK_4BIT, RX0, RX0; \ + transform_post(RX0, RB1, RB2, MASK_4BIT, RTMP0); \ + \ + /* linear part */ \ + vpshufb RB3, RX0, RTMP0; \ + vpxor RTMP0, s0, s0; /* s0 ^ x */ \ + vpshufb RTMP2, RX0, RTMP1; \ + vpxor RTMP1, RTMP0, RTMP0; /* x ^ rol(x,8) */ \ + vpshufb RTMP3, RX0, RTMP1; \ + vpxor RTMP1, RTMP0, RTMP0; /* x ^ rol(x,8) ^ rol(x,16) */ \ + vpshufb .Linv_shift_row_rol_24 rRIP, RX0, RTMP1; \ + vpxor RTMP1, s0, s0; /* s0 ^ x ^ rol(x,24) */ \ + vpslld $2, RTMP0, RTMP1; \ + vpsrld $30, RTMP0, RTMP0; \ + vpxor RTMP0, s0, s0; \ + /* s0 ^ x ^ rol(x,2) ^ rol(x,10) ^ rol(x,18) ^ rol(x,24) */ \ + vpxor RTMP1, s0, s0; + + leaq (32*4)(%rdi), %rax; +.align 16 +.Lroundloop_blk4: + ROUND(0, RA0, RA1, RA2, RA3); + ROUND(1, RA1, RA2, RA3, RA0); + ROUND(2, RA2, RA3, RA0, RA1); + ROUND(3, RA3, RA0, RA1, RA2); + leaq (4*4)(%rdi), %rdi; + cmpq %rax, %rdi; + jne .Lroundloop_blk4; + +#undef ROUND + + vmovdqa .Lbswap128_mask rRIP, RTMP2; + + transpose_4x4(RA0, RA1, RA2, RA3, RTMP0, RTMP1); + vpshufb RTMP2, RA0, RA0; + vpshufb RTMP2, RA1, RA1; + vpshufb RTMP2, RA2, RA2; + vpshufb RTMP2, RA3, RA3; + + vmovdqu RA0, 0*16(%rsi); + cmpq $2, %rcx; + jb .Lblk4_store_output_done; + vmovdqu RA1, 1*16(%rsi); + je .Lblk4_store_output_done; + vmovdqu RA2, 2*16(%rsi); + cmpq $3, %rcx; + je .Lblk4_store_output_done; + vmovdqu RA3, 3*16(%rsi); + +.Lblk4_store_output_done: + vzeroall; + FRAME_END + ret; +SYM_FUNC_END(sm4_aesni_avx_crypt4) + +.align 8 +SYM_FUNC_START_LOCAL(__sm4_crypt_blk8) + /* input: + * %rdi: round key array, CTX + * RA0, RA1, RA2, RA3, RB0, RB1, RB2, RB3: eight parallel + * plaintext blocks + * output: + * RA0, RA1, RA2, RA3, RB0, RB1, RB2, RB3: eight parallel + * ciphertext blocks + */ + FRAME_BEGIN + + vmovdqa .Lbswap32_mask rRIP, RTMP2; + vpshufb RTMP2, RA0, RA0; + vpshufb RTMP2, RA1, RA1; + vpshufb RTMP2, RA2, RA2; + vpshufb RTMP2, RA3, RA3; + vpshufb RTMP2, RB0, RB0; + vpshufb RTMP2, RB1, RB1; + vpshufb RTMP2, RB2, RB2; + vpshufb RTMP2, RB3, RB3; + + vbroadcastss .L0f0f0f0f rRIP, MASK_4BIT; + transpose_4x4(RA0, RA1, RA2, RA3, RTMP0, RTMP1); + transpose_4x4(RB0, RB1, RB2, RB3, RTMP0, RTMP1); + +#define ROUND(round, s0, s1, s2, s3, r0, r1, r2, r3) \ + vbroadcastss (4*(round))(%rdi), RX0; \ + vmovdqa .Lpre_tf_lo_s rRIP, RTMP4; \ + vmovdqa .Lpre_tf_hi_s rRIP, RTMP1; \ + vmovdqa RX0, RX1; \ + vpxor s1, RX0, RX0; \ + vpxor s2, RX0, RX0; \ + vpxor s3, RX0, RX0; /* s1 ^ s2 ^ s3 ^ rk */ \ + vmovdqa .Lpost_tf_lo_s rRIP, RTMP2; \ + vmovdqa .Lpost_tf_hi_s rRIP, RTMP3; \ + vpxor r1, RX1, RX1; \ + vpxor r2, RX1, RX1; \ + vpxor r3, RX1, RX1; /* r1 ^ r2 ^ r3 ^ rk */ \ + \ + /* sbox, non-linear part */ \ + transform_pre(RX0, RTMP4, RTMP1, MASK_4BIT, RTMP0); \ + transform_pre(RX1, RTMP4, RTMP1, MASK_4BIT, RTMP0); \ + vmovdqa .Linv_shift_row rRIP, RTMP4; \ + vaesenclast MASK_4BIT, RX0, RX0; \ + vaesenclast MASK_4BIT, RX1, RX1; \ + transform_post(RX0, RTMP2, RTMP3, MASK_4BIT, RTMP0); \ + transform_post(RX1, RTMP2, RTMP3, MASK_4BIT, RTMP0); \ + \ + /* linear part */ \ + vpshufb RTMP4, RX0, RTMP0; \ + vpxor RTMP0, s0, s0; /* s0 ^ x */ \ + vpshufb RTMP4, RX1, RTMP2; \ + vmovdqa .Linv_shift_row_rol_8 rRIP, RTMP4; \ + vpxor RTMP2, r0, r0; /* r0 ^ x */ \ + vpshufb RTMP4, RX0, RTMP1; \ + vpxor RTMP1, RTMP0, RTMP0; /* x ^ rol(x,8) */ \ + vpshufb RTMP4, RX1, RTMP3; \ + vmovdqa .Linv_shift_row_rol_16 rRIP, RTMP4; \ + vpxor RTMP3, RTMP2, RTMP2; /* x ^ rol(x,8) */ \ + vpshufb RTMP4, RX0, RTMP1; \ + vpxor RTMP1, RTMP0, RTMP0; /* x ^ rol(x,8) ^ rol(x,16) */ \ + vpshufb RTMP4, RX1, RTMP3; \ + vmovdqa .Linv_shift_row_rol_24 rRIP, RTMP4; \ + vpxor RTMP3, RTMP2, RTMP2; /* x ^ rol(x,8) ^ rol(x,16) */ \ + vpshufb RTMP4, RX0, RTMP1; \ + vpxor RTMP1, s0, s0; /* s0 ^ x ^ rol(x,24) */ \ + /* s0 ^ x ^ rol(x,2) ^ rol(x,10) ^ rol(x,18) ^ rol(x,24) */ \ + vpslld $2, RTMP0, RTMP1; \ + vpsrld $30, RTMP0, RTMP0; \ + vpxor RTMP0, s0, s0; \ + vpxor RTMP1, s0, s0; \ + vpshufb RTMP4, RX1, RTMP3; \ + vpxor RTMP3, r0, r0; /* r0 ^ x ^ rol(x,24) */ \ + /* r0 ^ x ^ rol(x,2) ^ rol(x,10) ^ rol(x,18) ^ rol(x,24) */ \ + vpslld $2, RTMP2, RTMP3; \ + vpsrld $30, RTMP2, RTMP2; \ + vpxor RTMP2, r0, r0; \ + vpxor RTMP3, r0, r0; + + leaq (32*4)(%rdi), %rax; +.align 16 +.Lroundloop_blk8: + ROUND(0, RA0, RA1, RA2, RA3, RB0, RB1, RB2, RB3); + ROUND(1, RA1, RA2, RA3, RA0, RB1, RB2, RB3, RB0); + ROUND(2, RA2, RA3, RA0, RA1, RB2, RB3, RB0, RB1); + ROUND(3, RA3, RA0, RA1, RA2, RB3, RB0, RB1, RB2); + leaq (4*4)(%rdi), %rdi; + cmpq %rax, %rdi; + jne .Lroundloop_blk8; + +#undef ROUND + + vmovdqa .Lbswap128_mask rRIP, RTMP2; + + transpose_4x4(RA0, RA1, RA2, RA3, RTMP0, RTMP1); + transpose_4x4(RB0, RB1, RB2, RB3, RTMP0, RTMP1); + vpshufb RTMP2, RA0, RA0; + vpshufb RTMP2, RA1, RA1; + vpshufb RTMP2, RA2, RA2; + vpshufb RTMP2, RA3, RA3; + vpshufb RTMP2, RB0, RB0; + vpshufb RTMP2, RB1, RB1; + vpshufb RTMP2, RB2, RB2; + vpshufb RTMP2, RB3, RB3; + + FRAME_END + ret; +SYM_FUNC_END(__sm4_crypt_blk8) + +/* + * void sm4_aesni_avx_crypt8(const u32 *rk, u8 *dst, + * const u8 *src, int nblocks) + */ +.align 8 +SYM_FUNC_START(sm4_aesni_avx_crypt8) + /* input: + * %rdi: round key array, CTX + * %rsi: dst (1..8 blocks) + * %rdx: src (1..8 blocks) + * %rcx: num blocks (1..8) + */ + FRAME_BEGIN + + cmpq $5, %rcx; + jb sm4_aesni_avx_crypt4; + vmovdqu (0 * 16)(%rdx), RA0; + vmovdqu (1 * 16)(%rdx), RA1; + vmovdqu (2 * 16)(%rdx), RA2; + vmovdqu (3 * 16)(%rdx), RA3; + vmovdqu (4 * 16)(%rdx), RB0; + vmovdqa RB0, RB1; + vmovdqa RB0, RB2; + vmovdqa RB0, RB3; + je .Lblk8_load_input_done; + vmovdqu (5 * 16)(%rdx), RB1; + cmpq $7, %rcx; + jb .Lblk8_load_input_done; + vmovdqu (6 * 16)(%rdx), RB2; + je .Lblk8_load_input_done; + vmovdqu (7 * 16)(%rdx), RB3; + +.Lblk8_load_input_done: + call __sm4_crypt_blk8; + + cmpq $6, %rcx; + vmovdqu RA0, (0 * 16)(%rsi); + vmovdqu RA1, (1 * 16)(%rsi); + vmovdqu RA2, (2 * 16)(%rsi); + vmovdqu RA3, (3 * 16)(%rsi); + vmovdqu RB0, (4 * 16)(%rsi); + jb .Lblk8_store_output_done; + vmovdqu RB1, (5 * 16)(%rsi); + je .Lblk8_store_output_done; + vmovdqu RB2, (6 * 16)(%rsi); + cmpq $7, %rcx; + je .Lblk8_store_output_done; + vmovdqu RB3, (7 * 16)(%rsi); + +.Lblk8_store_output_done: + vzeroall; + FRAME_END + ret; +SYM_FUNC_END(sm4_aesni_avx_crypt8) + +/* + * void sm4_aesni_avx_ctr_enc_blk8(const u32 *rk, u8 *dst, + * const u8 *src, u8 *iv) + */ +.align 8 +SYM_FUNC_START(sm4_aesni_avx_ctr_enc_blk8) + /* input: + * %rdi: round key array, CTX + * %rsi: dst (8 blocks) + * %rdx: src (8 blocks) + * %rcx: iv (big endian, 128bit) + */ + FRAME_BEGIN + + /* load IV and byteswap */ + vmovdqu (%rcx), RA0; + + vmovdqa .Lbswap128_mask rRIP, RBSWAP; + vpshufb RBSWAP, RA0, RTMP0; /* be => le */ + + vpcmpeqd RNOT, RNOT, RNOT; + vpsrldq $8, RNOT, RNOT; /* low: -1, high: 0 */ + +#define inc_le128(x, minus_one, tmp) \ + vpcmpeqq minus_one, x, tmp; \ + vpsubq minus_one, x, x; \ + vpslldq $8, tmp, tmp; \ + vpsubq tmp, x, x; + + /* construct IVs */ + inc_le128(RTMP0, RNOT, RTMP2); /* +1 */ + vpshufb RBSWAP, RTMP0, RA1; + inc_le128(RTMP0, RNOT, RTMP2); /* +2 */ + vpshufb RBSWAP, RTMP0, RA2; + inc_le128(RTMP0, RNOT, RTMP2); /* +3 */ + vpshufb RBSWAP, RTMP0, RA3; + inc_le128(RTMP0, RNOT, RTMP2); /* +4 */ + vpshufb RBSWAP, RTMP0, RB0; + inc_le128(RTMP0, RNOT, RTMP2); /* +5 */ + vpshufb RBSWAP, RTMP0, RB1; + inc_le128(RTMP0, RNOT, RTMP2); /* +6 */ + vpshufb RBSWAP, RTMP0, RB2; + inc_le128(RTMP0, RNOT, RTMP2); /* +7 */ + vpshufb RBSWAP, RTMP0, RB3; + inc_le128(RTMP0, RNOT, RTMP2); /* +8 */ + vpshufb RBSWAP, RTMP0, RTMP1; + + /* store new IV */ + vmovdqu RTMP1, (%rcx); + + call __sm4_crypt_blk8; + + vpxor (0 * 16)(%rdx), RA0, RA0; + vpxor (1 * 16)(%rdx), RA1, RA1; + vpxor (2 * 16)(%rdx), RA2, RA2; + vpxor (3 * 16)(%rdx), RA3, RA3; + vpxor (4 * 16)(%rdx), RB0, RB0; + vpxor (5 * 16)(%rdx), RB1, RB1; + vpxor (6 * 16)(%rdx), RB2, RB2; + vpxor (7 * 16)(%rdx), RB3, RB3; + + vmovdqu RA0, (0 * 16)(%rsi); + vmovdqu RA1, (1 * 16)(%rsi); + vmovdqu RA2, (2 * 16)(%rsi); + vmovdqu RA3, (3 * 16)(%rsi); + vmovdqu RB0, (4 * 16)(%rsi); + vmovdqu RB1, (5 * 16)(%rsi); + vmovdqu RB2, (6 * 16)(%rsi); + vmovdqu RB3, (7 * 16)(%rsi); + + vzeroall; + FRAME_END + ret; +SYM_FUNC_END(sm4_aesni_avx_ctr_enc_blk8) + +/* + * void sm4_aesni_avx_cbc_dec_blk8(const u32 *rk, u8 *dst, + * const u8 *src, u8 *iv) + */ +.align 8 +SYM_FUNC_START(sm4_aesni_avx_cbc_dec_blk8) + /* input: + * %rdi: round key array, CTX + * %rsi: dst (8 blocks) + * %rdx: src (8 blocks) + * %rcx: iv + */ + FRAME_BEGIN + + vmovdqu (0 * 16)(%rdx), RA0; + vmovdqu (1 * 16)(%rdx), RA1; + vmovdqu (2 * 16)(%rdx), RA2; + vmovdqu (3 * 16)(%rdx), RA3; + vmovdqu (4 * 16)(%rdx), RB0; + vmovdqu (5 * 16)(%rdx), RB1; + vmovdqu (6 * 16)(%rdx), RB2; + vmovdqu (7 * 16)(%rdx), RB3; + + call __sm4_crypt_blk8; + + vmovdqu (7 * 16)(%rdx), RNOT; + vpxor (%rcx), RA0, RA0; + vpxor (0 * 16)(%rdx), RA1, RA1; + vpxor (1 * 16)(%rdx), RA2, RA2; + vpxor (2 * 16)(%rdx), RA3, RA3; + vpxor (3 * 16)(%rdx), RB0, RB0; + vpxor (4 * 16)(%rdx), RB1, RB1; + vpxor (5 * 16)(%rdx), RB2, RB2; + vpxor (6 * 16)(%rdx), RB3, RB3; + vmovdqu RNOT, (%rcx); /* store new IV */ + + vmovdqu RA0, (0 * 16)(%rsi); + vmovdqu RA1, (1 * 16)(%rsi); + vmovdqu RA2, (2 * 16)(%rsi); + vmovdqu RA3, (3 * 16)(%rsi); + vmovdqu RB0, (4 * 16)(%rsi); + vmovdqu RB1, (5 * 16)(%rsi); + vmovdqu RB2, (6 * 16)(%rsi); + vmovdqu RB3, (7 * 16)(%rsi); + + vzeroall; + FRAME_END + ret; +SYM_FUNC_END(sm4_aesni_avx_cbc_dec_blk8) + +/* + * void sm4_aesni_avx_cfb_dec_blk8(const u32 *rk, u8 *dst, + * const u8 *src, u8 *iv) + */ +.align 8 +SYM_FUNC_START(sm4_aesni_avx_cfb_dec_blk8) + /* input: + * %rdi: round key array, CTX + * %rsi: dst (8 blocks) + * %rdx: src (8 blocks) + * %rcx: iv + */ + FRAME_BEGIN + + /* Load input */ + vmovdqu (%rcx), RA0; + vmovdqu 0 * 16(%rdx), RA1; + vmovdqu 1 * 16(%rdx), RA2; + vmovdqu 2 * 16(%rdx), RA3; + vmovdqu 3 * 16(%rdx), RB0; + vmovdqu 4 * 16(%rdx), RB1; + vmovdqu 5 * 16(%rdx), RB2; + vmovdqu 6 * 16(%rdx), RB3; + + /* Update IV */ + vmovdqu 7 * 16(%rdx), RNOT; + vmovdqu RNOT, (%rcx); + + call __sm4_crypt_blk8; + + vpxor (0 * 16)(%rdx), RA0, RA0; + vpxor (1 * 16)(%rdx), RA1, RA1; + vpxor (2 * 16)(%rdx), RA2, RA2; + vpxor (3 * 16)(%rdx), RA3, RA3; + vpxor (4 * 16)(%rdx), RB0, RB0; + vpxor (5 * 16)(%rdx), RB1, RB1; + vpxor (6 * 16)(%rdx), RB2, RB2; + vpxor (7 * 16)(%rdx), RB3, RB3; + + vmovdqu RA0, (0 * 16)(%rsi); + vmovdqu RA1, (1 * 16)(%rsi); + vmovdqu RA2, (2 * 16)(%rsi); + vmovdqu RA3, (3 * 16)(%rsi); + vmovdqu RB0, (4 * 16)(%rsi); + vmovdqu RB1, (5 * 16)(%rsi); + vmovdqu RB2, (6 * 16)(%rsi); + vmovdqu RB3, (7 * 16)(%rsi); + + vzeroall; + FRAME_END + ret; +SYM_FUNC_END(sm4_aesni_avx_cfb_dec_blk8) diff --git a/arch/x86/crypto/sm4_aesni_avx_glue.c b/arch/x86/crypto/sm4_aesni_avx_glue.c new file mode 100644 index 000000000000..c1f5728efd1d --- /dev/null +++ b/arch/x86/crypto/sm4_aesni_avx_glue.c @@ -0,0 +1,459 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * SM4 Cipher Algorithm, AES-NI/AVX optimized. + * as specified in + * https://tools.ietf.org/id/draft-ribose-cfrg-sm4-10.html + * + * Copyright (c) 2021, Alibaba Group. + * Copyright (c) 2021 Tianjia Zhang tianjia.zhang@linux.alibaba.com + */ + +#include <linux/module.h> +#include <linux/crypto.h> +#include <linux/kernel.h> +#include <asm/simd.h> +#include <crypto/internal/simd.h> +#include <crypto/internal/skcipher.h> +#include <crypto/sm4.h> + +#define SM4_CRYPT8_BLOCK_SIZE (SM4_BLOCK_SIZE * 8) + +asmlinkage void sm4_aesni_avx_crypt4(const u32 *rk, u8 *dst, + const u8 *src, int nblocks); +asmlinkage void sm4_aesni_avx_crypt8(const u32 *rk, u8 *dst, + const u8 *src, int nblocks); +asmlinkage void sm4_aesni_avx_ctr_enc_blk8(const u32 *rk, u8 *dst, + const u8 *src, u8 *iv); +asmlinkage void sm4_aesni_avx_cbc_dec_blk8(const u32 *rk, u8 *dst, + const u8 *src, u8 *iv); +asmlinkage void sm4_aesni_avx_cfb_dec_blk8(const u32 *rk, u8 *dst, + const u8 *src, u8 *iv); + +static int sm4_skcipher_setkey(struct crypto_skcipher *tfm, const u8 *key, + unsigned int key_len) +{ + struct sm4_ctx *ctx = crypto_skcipher_ctx(tfm); + + return sm4_expandkey(ctx, key, key_len); +} + +static int ecb_do_crypt(struct skcipher_request *req, const u32 *rkey) +{ + struct skcipher_walk walk; + unsigned int nbytes; + int err; + + err = skcipher_walk_virt(&walk, req, false); + + while ((nbytes = walk.nbytes) > 0) { + const u8 *src = walk.src.virt.addr; + u8 *dst = walk.dst.virt.addr; + + kernel_fpu_begin(); + while (nbytes >= SM4_CRYPT8_BLOCK_SIZE) { + sm4_aesni_avx_crypt8(rkey, dst, src, 8); + dst += SM4_CRYPT8_BLOCK_SIZE; + src += SM4_CRYPT8_BLOCK_SIZE; + nbytes -= SM4_CRYPT8_BLOCK_SIZE; + } + while (nbytes >= SM4_BLOCK_SIZE) { + unsigned int nblocks = min(nbytes >> 4, 4u); + sm4_aesni_avx_crypt4(rkey, dst, src, nblocks); + dst += nblocks * SM4_BLOCK_SIZE; + src += nblocks * SM4_BLOCK_SIZE; + nbytes -= nblocks * SM4_BLOCK_SIZE; + } + kernel_fpu_end(); + + err = skcipher_walk_done(&walk, nbytes); + } + + return err; +} + +static int ecb_encrypt(struct skcipher_request *req) +{ + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct sm4_ctx *ctx = crypto_skcipher_ctx(tfm); + + return ecb_do_crypt(req, ctx->rkey_enc); +} + +static int ecb_decrypt(struct skcipher_request *req) +{ + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct sm4_ctx *ctx = crypto_skcipher_ctx(tfm); + + return ecb_do_crypt(req, ctx->rkey_dec); +} + +static int cbc_encrypt(struct skcipher_request *req) +{ + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct sm4_ctx *ctx = crypto_skcipher_ctx(tfm); + struct skcipher_walk walk; + unsigned int nbytes; + int err; + + err = skcipher_walk_virt(&walk, req, false); + + while ((nbytes = walk.nbytes) > 0) { + const u8 *iv = walk.iv; + const u8 *src = walk.src.virt.addr; + u8 *dst = walk.dst.virt.addr; + + while (nbytes >= SM4_BLOCK_SIZE) { + crypto_xor_cpy(dst, src, iv, SM4_BLOCK_SIZE); + sm4_crypt_block(ctx->rkey_enc, dst, dst); + iv = dst; + src += SM4_BLOCK_SIZE; + dst += SM4_BLOCK_SIZE; + nbytes -= SM4_BLOCK_SIZE; + } + if (iv != walk.iv) + memcpy(walk.iv, iv, SM4_BLOCK_SIZE); + + err = skcipher_walk_done(&walk, nbytes); + } + + return err; +} + +static int cbc_decrypt(struct skcipher_request *req) +{ + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct sm4_ctx *ctx = crypto_skcipher_ctx(tfm); + struct skcipher_walk walk; + unsigned int nbytes; + int err; + + err = skcipher_walk_virt(&walk, req, false); + + while ((nbytes = walk.nbytes) > 0) { + const u8 *src = walk.src.virt.addr; + u8 *dst = walk.dst.virt.addr; + + kernel_fpu_begin(); + + while (nbytes >= SM4_CRYPT8_BLOCK_SIZE) { + sm4_aesni_avx_cbc_dec_blk8(ctx->rkey_dec, dst, + src, walk.iv); + dst += SM4_CRYPT8_BLOCK_SIZE; + src += SM4_CRYPT8_BLOCK_SIZE; + nbytes -= SM4_CRYPT8_BLOCK_SIZE; + } + + if (nbytes >= SM4_BLOCK_SIZE) { + u8 keystream[SM4_BLOCK_SIZE * 8]; + u8 iv[SM4_BLOCK_SIZE]; + unsigned int nblocks = min(nbytes >> 4, 8u); + int i; + + sm4_aesni_avx_crypt8(ctx->rkey_dec, keystream, + src, nblocks); + + src += ((int)nblocks - 2) * SM4_BLOCK_SIZE; + dst += (nblocks - 1) * SM4_BLOCK_SIZE; + memcpy(iv, src + SM4_BLOCK_SIZE, SM4_BLOCK_SIZE); + + for (i = nblocks - 1; i > 0; i--) { + crypto_xor_cpy(dst, src, + &keystream[i * SM4_BLOCK_SIZE], + SM4_BLOCK_SIZE); + src -= SM4_BLOCK_SIZE; + dst -= SM4_BLOCK_SIZE; + } + crypto_xor_cpy(dst, walk.iv, keystream, SM4_BLOCK_SIZE); + memcpy(walk.iv, iv, SM4_BLOCK_SIZE); + nbytes -= nblocks * SM4_BLOCK_SIZE; + } + + kernel_fpu_end(); + err = skcipher_walk_done(&walk, nbytes); + } + + return err; +} + +static int cfb_encrypt(struct skcipher_request *req) +{ + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct sm4_ctx *ctx = crypto_skcipher_ctx(tfm); + struct skcipher_walk walk; + unsigned int nbytes; + int err; + + err = skcipher_walk_virt(&walk, req, false); + + while ((nbytes = walk.nbytes) > 0) { + u8 keystream[SM4_BLOCK_SIZE]; + const u8 *iv = walk.iv; + const u8 *src = walk.src.virt.addr; + u8 *dst = walk.dst.virt.addr; + + while (nbytes >= SM4_BLOCK_SIZE) { + sm4_crypt_block(ctx->rkey_enc, keystream, iv); + crypto_xor_cpy(dst, src, keystream, SM4_BLOCK_SIZE); + iv = dst; + src += SM4_BLOCK_SIZE; + dst += SM4_BLOCK_SIZE; + nbytes -= SM4_BLOCK_SIZE; + } + if (iv != walk.iv) + memcpy(walk.iv, iv, SM4_BLOCK_SIZE); + + /* tail */ + if (walk.nbytes == walk.total && nbytes > 0) { + sm4_crypt_block(ctx->rkey_enc, keystream, walk.iv); + crypto_xor_cpy(dst, src, keystream, nbytes); + nbytes = 0; + } + + err = skcipher_walk_done(&walk, nbytes); + } + + return err; +} + +static int cfb_decrypt(struct skcipher_request *req) +{ + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct sm4_ctx *ctx = crypto_skcipher_ctx(tfm); + struct skcipher_walk walk; + unsigned int nbytes; + int err; + + err = skcipher_walk_virt(&walk, req, false); + + while ((nbytes = walk.nbytes) > 0) { + const u8 *src = walk.src.virt.addr; + u8 *dst = walk.dst.virt.addr; + + kernel_fpu_begin(); + + while (nbytes >= SM4_CRYPT8_BLOCK_SIZE) { + sm4_aesni_avx_cfb_dec_blk8(ctx->rkey_enc, dst, + src, walk.iv); + dst += SM4_CRYPT8_BLOCK_SIZE; + src += SM4_CRYPT8_BLOCK_SIZE; + nbytes -= SM4_CRYPT8_BLOCK_SIZE; + } + + if (nbytes >= SM4_BLOCK_SIZE) { + u8 keystream[SM4_BLOCK_SIZE * 8]; + unsigned int nblocks = min(nbytes >> 4, 8u); + + memcpy(keystream, walk.iv, SM4_BLOCK_SIZE); + if (nblocks > 1) + memcpy(&keystream[SM4_BLOCK_SIZE], src, + (nblocks - 1) * SM4_BLOCK_SIZE); + memcpy(walk.iv, src + (nblocks - 1) * SM4_BLOCK_SIZE, + SM4_BLOCK_SIZE); + + sm4_aesni_avx_crypt8(ctx->rkey_enc, keystream, + keystream, nblocks); + + crypto_xor_cpy(dst, src, keystream, + nblocks * SM4_BLOCK_SIZE); + dst += nblocks * SM4_BLOCK_SIZE; + src += nblocks * SM4_BLOCK_SIZE; + nbytes -= nblocks * SM4_BLOCK_SIZE; + } + + kernel_fpu_end(); + + /* tail */ + if (walk.nbytes == walk.total && nbytes > 0) { + u8 keystream[SM4_BLOCK_SIZE]; + + sm4_crypt_block(ctx->rkey_enc, keystream, walk.iv); + crypto_xor_cpy(dst, src, keystream, nbytes); + nbytes = 0; + } + + err = skcipher_walk_done(&walk, nbytes); + } + + return err; +} + +static int ctr_crypt(struct skcipher_request *req) +{ + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct sm4_ctx *ctx = crypto_skcipher_ctx(tfm); + struct skcipher_walk walk; + unsigned int nbytes; + int err; + + err = skcipher_walk_virt(&walk, req, false); + + while ((nbytes = walk.nbytes) > 0) { + const u8 *src = walk.src.virt.addr; + u8 *dst = walk.dst.virt.addr; + + kernel_fpu_begin(); + + while (nbytes >= SM4_CRYPT8_BLOCK_SIZE) { + sm4_aesni_avx_ctr_enc_blk8(ctx->rkey_enc, dst, + src, walk.iv); + dst += SM4_CRYPT8_BLOCK_SIZE; + src += SM4_CRYPT8_BLOCK_SIZE; + nbytes -= SM4_CRYPT8_BLOCK_SIZE; + } + + if (nbytes >= SM4_BLOCK_SIZE) { + u8 keystream[SM4_BLOCK_SIZE * 8]; + unsigned int nblocks = min(nbytes >> 4, 8u); + int i; + + for (i = 0; i < nblocks; i++) { + memcpy(&keystream[i * SM4_BLOCK_SIZE], + walk.iv, SM4_BLOCK_SIZE); + crypto_inc(walk.iv, SM4_BLOCK_SIZE); + } + sm4_aesni_avx_crypt8(ctx->rkey_enc, keystream, + keystream, nblocks); + + crypto_xor_cpy(dst, src, keystream, + nblocks * SM4_BLOCK_SIZE); + dst += nblocks * SM4_BLOCK_SIZE; + src += nblocks * SM4_BLOCK_SIZE; + nbytes -= nblocks * SM4_BLOCK_SIZE; + } + + kernel_fpu_end(); + + /* tail */ + if (walk.nbytes == walk.total && nbytes > 0) { + u8 keystream[SM4_BLOCK_SIZE]; + + memcpy(keystream, walk.iv, SM4_BLOCK_SIZE); + crypto_inc(walk.iv, SM4_BLOCK_SIZE); + + sm4_crypt_block(ctx->rkey_enc, keystream, keystream); + + crypto_xor_cpy(dst, src, keystream, nbytes); + dst += nbytes; + src += nbytes; + nbytes = 0; + } + + err = skcipher_walk_done(&walk, nbytes); + } + + return err; +} + +static struct skcipher_alg sm4_aesni_avx_skciphers[] = { + { + .base = { + .cra_name = "__ecb(sm4)", + .cra_driver_name = "__ecb-sm4-aesni-avx", + .cra_priority = 400, + .cra_flags = CRYPTO_ALG_INTERNAL, + .cra_blocksize = SM4_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct sm4_ctx), + .cra_module = THIS_MODULE, + }, + .min_keysize = SM4_KEY_SIZE, + .max_keysize = SM4_KEY_SIZE, + .walksize = 8 * SM4_BLOCK_SIZE, + .setkey = sm4_skcipher_setkey, + .encrypt = ecb_encrypt, + .decrypt = ecb_decrypt, + }, { + .base = { + .cra_name = "__cbc(sm4)", + .cra_driver_name = "__cbc-sm4-aesni-avx", + .cra_priority = 400, + .cra_flags = CRYPTO_ALG_INTERNAL, + .cra_blocksize = SM4_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct sm4_ctx), + .cra_module = THIS_MODULE, + }, + .min_keysize = SM4_KEY_SIZE, + .max_keysize = SM4_KEY_SIZE, + .ivsize = SM4_BLOCK_SIZE, + .walksize = 8 * SM4_BLOCK_SIZE, + .setkey = sm4_skcipher_setkey, + .encrypt = cbc_encrypt, + .decrypt = cbc_decrypt, + }, { + .base = { + .cra_name = "__cfb(sm4)", + .cra_driver_name = "__cfb-sm4-aesni-avx", + .cra_priority = 400, + .cra_flags = CRYPTO_ALG_INTERNAL, + .cra_blocksize = 1, + .cra_ctxsize = sizeof(struct sm4_ctx), + .cra_module = THIS_MODULE, + }, + .min_keysize = SM4_KEY_SIZE, + .max_keysize = SM4_KEY_SIZE, + .ivsize = SM4_BLOCK_SIZE, + .chunksize = SM4_BLOCK_SIZE, + .walksize = 8 * SM4_BLOCK_SIZE, + .setkey = sm4_skcipher_setkey, + .encrypt = cfb_encrypt, + .decrypt = cfb_decrypt, + }, { + .base = { + .cra_name = "__ctr(sm4)", + .cra_driver_name = "__ctr-sm4-aesni-avx", + .cra_priority = 400, + .cra_flags = CRYPTO_ALG_INTERNAL, + .cra_blocksize = 1, + .cra_ctxsize = sizeof(struct sm4_ctx), + .cra_module = THIS_MODULE, + }, + .min_keysize = SM4_KEY_SIZE, + .max_keysize = SM4_KEY_SIZE, + .ivsize = SM4_BLOCK_SIZE, + .chunksize = SM4_BLOCK_SIZE, + .walksize = 8 * SM4_BLOCK_SIZE, + .setkey = sm4_skcipher_setkey, + .encrypt = ctr_crypt, + .decrypt = ctr_crypt, + } +}; + +static struct simd_skcipher_alg * +simd_sm4_aesni_avx_skciphers[ARRAY_SIZE(sm4_aesni_avx_skciphers)]; + +static int __init sm4_init(void) +{ + const char *feature_name; + + if (!boot_cpu_has(X86_FEATURE_AVX) || + !boot_cpu_has(X86_FEATURE_AES) || + !boot_cpu_has(X86_FEATURE_OSXSAVE)) { + pr_info("AVX or AES-NI instructions are not detected.\n"); + return -ENODEV; + } + + if (!cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, + &feature_name)) { + pr_info("CPU feature '%s' is not supported.\n", feature_name); + return -ENODEV; + } + + return simd_register_skciphers_compat(sm4_aesni_avx_skciphers, + ARRAY_SIZE(sm4_aesni_avx_skciphers), + simd_sm4_aesni_avx_skciphers); +} + +static void __exit sm4_exit(void) +{ + simd_unregister_skciphers(sm4_aesni_avx_skciphers, + ARRAY_SIZE(sm4_aesni_avx_skciphers), + simd_sm4_aesni_avx_skciphers); +} + +module_init(sm4_init); +module_exit(sm4_exit); + +MODULE_LICENSE("GPL v2"); +MODULE_AUTHOR("Tianjia Zhang tianjia.zhang@linux.alibaba.com"); +MODULE_DESCRIPTION("SM4 Cipher Algorithm, AES-NI/AVX optimized"); +MODULE_ALIAS_CRYPTO("sm4"); +MODULE_ALIAS_CRYPTO("sm4-aesni-avx"); diff --git a/crypto/Kconfig b/crypto/Kconfig index b6b30ac9d75d..9f4ea9c26652 100644 --- a/crypto/Kconfig +++ b/crypto/Kconfig @@ -1642,6 +1642,27 @@ config CRYPTO_SM4
If unsure, say N.
+config CRYPTO_SM4_AESNI_AVX_X86_64 + tristate "SM4 cipher algorithm (x86_64/AES-NI/AVX)" + depends on X86 && 64BIT + select CRYPTO_SKCIPHER + select CRYPTO_SIMD + select CRYPTO_ALGAPI + select CRYPTO_LIB_SM4 + help + SM4 cipher algorithms (OSCCA GB/T 32907-2016) (x86_64/AES-NI/AVX). + + SM4 (GBT.32907-2016) is a cryptographic standard issued by the + Organization of State Commercial Administration of China (OSCCA) + as an authorized cryptographic algorithms for the use within China. + + This is SM4 optimized implementation using AES-NI/AVX/x86_64 + instruction set for block cipher. Through two affine transforms, + we can use the AES S-Box to simulate the SM4 S-Box to achieve the + effect of instruction acceleration. + + If unsure, say N. + config CRYPTO_TEA tristate "TEA, XTEA and XETA cipher algorithms" depends on CRYPTO_USER_API_ENABLE_OBSOLETE
From: 沈子俊 shenzijun@kylinos.cn
mainline inclusion from mainline-v5.15.3 commit a7fc80bb22eb0f13791ee4f70484e88316cc2a24 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4A842?from=project-issue CVE: NA
-----------------------------------------------------------------
tcrypt supports testing of SM4 cipher algorithms that use avx instruction set acceleration. The implementation of sm4 instruction set acceleration supports up to 8 blocks in parallel encryption and decryption, which is 128 bytes. Therefore, the 128-byte block size is also added to block_sizes.
Signed-off-by: Tianjia Zhang tianjia.zhang@linux.alibaba.com Signed-off-by: Herbert Xu herbert@gondor.apana.org.au Signed-off-by: 沈子俊 shenzijun@kylinos.cn Reviewed-by: weiyang wang wangweiyang2@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- crypto/tcrypt.c | 26 +++++++++++++++++++++++++- 1 file changed, 25 insertions(+), 1 deletion(-)
diff --git a/crypto/tcrypt.c b/crypto/tcrypt.c index 5f9395a28c97..41b3443f56a0 100644 --- a/crypto/tcrypt.c +++ b/crypto/tcrypt.c @@ -77,7 +77,7 @@ static const char *check[] = { NULL };
-static u32 block_sizes[] = { 16, 64, 256, 1024, 1472, 8192, 0 }; +static u32 block_sizes[] = { 16, 64, 128, 256, 1024, 1472, 8192, 0 }; static u32 aead_sizes[] = { 16, 64, 256, 512, 1024, 2048, 4096, 8192, 0 };
#define XBUFSIZE 8 @@ -2075,6 +2075,7 @@ static int do_test(const char *alg, u32 type, u32 mask, int m, u32 num_mb) case 191: ret += tcrypt_test("ecb(sm4)"); ret += tcrypt_test("cbc(sm4)"); + ret += tcrypt_test("cfb(sm4)"); ret += tcrypt_test("ctr(sm4)"); break; case 200: @@ -2338,6 +2339,10 @@ static int do_test(const char *alg, u32 type, u32 mask, int m, u32 num_mb) speed_template_16); test_cipher_speed("cbc(sm4)", DECRYPT, sec, NULL, 0, speed_template_16); + test_cipher_speed("cfb(sm4)", ENCRYPT, sec, NULL, 0, + speed_template_16); + test_cipher_speed("cfb(sm4)", DECRYPT, sec, NULL, 0, + speed_template_16); test_cipher_speed("ctr(sm4)", ENCRYPT, sec, NULL, 0, speed_template_16); test_cipher_speed("ctr(sm4)", DECRYPT, sec, NULL, 0, @@ -2858,6 +2863,25 @@ static int do_test(const char *alg, u32 type, u32 mask, int m, u32 num_mb) speed_template_8_32); break;
+ case 518: + test_acipher_speed("ecb(sm4)", ENCRYPT, sec, NULL, 0, + speed_template_16); + test_acipher_speed("ecb(sm4)", DECRYPT, sec, NULL, 0, + speed_template_16); + test_acipher_speed("cbc(sm4)", ENCRYPT, sec, NULL, 0, + speed_template_16); + test_acipher_speed("cbc(sm4)", DECRYPT, sec, NULL, 0, + speed_template_16); + test_acipher_speed("cfb(sm4)", ENCRYPT, sec, NULL, 0, + speed_template_16); + test_acipher_speed("cfb(sm4)", DECRYPT, sec, NULL, 0, + speed_template_16); + test_acipher_speed("ctr(sm4)", ENCRYPT, sec, NULL, 0, + speed_template_16); + test_acipher_speed("ctr(sm4)", DECRYPT, sec, NULL, 0, + speed_template_16); + break; + case 600: test_mb_skcipher_speed("ecb(aes)", ENCRYPT, sec, NULL, 0, speed_template_16_24_32, num_mb);
From: 沈子俊 shenzijun@kylinos.cn
mainline inclusion from mainline-v5.15.4 commit de79d9aae493a29d02926f396a4fd1a1309436fc category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4A82K?from=project-issue CVE: NA
-------------------------------------------------------------------
Export the reusable functions in the SM4 AESNI/AVX implementation, mainly public functions, which are used to develop the SM4 AESNI/AVX2 implementation, and eliminate unnecessary duplication of code.
At the same time, in order to make the public function universal, minor fixes was added.
Signed-off-by: Tianjia Zhang tianjia.zhang@linux.alibaba.com Signed-off-by: Herbert Xu herbert@gondor.apana.org.au Signed-off-by: 沈子俊 shenzijun@kylinos.cn Reviewed-by: weiyang wang wangweiyang2@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- arch/x86/crypto/sm4-avx.h | 24 ++++++++ arch/x86/crypto/sm4_aesni_avx_glue.c | 92 ++++++++++++++++++---------- 2 files changed, 84 insertions(+), 32 deletions(-) create mode 100644 arch/x86/crypto/sm4-avx.h
diff --git a/arch/x86/crypto/sm4-avx.h b/arch/x86/crypto/sm4-avx.h new file mode 100644 index 000000000000..1bceab7516aa --- /dev/null +++ b/arch/x86/crypto/sm4-avx.h @@ -0,0 +1,24 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +#ifndef ASM_X86_SM4_AVX_H +#define ASM_X86_SM4_AVX_H + +#include <linux/types.h> +#include <crypto/sm4.h> + +typedef void (*sm4_crypt_func)(const u32 *rk, u8 *dst, const u8 *src, u8 *iv); + +int sm4_avx_ecb_encrypt(struct skcipher_request *req); +int sm4_avx_ecb_decrypt(struct skcipher_request *req); + +int sm4_cbc_encrypt(struct skcipher_request *req); +int sm4_avx_cbc_decrypt(struct skcipher_request *req, + unsigned int bsize, sm4_crypt_func func); + +int sm4_cfb_encrypt(struct skcipher_request *req); +int sm4_avx_cfb_decrypt(struct skcipher_request *req, + unsigned int bsize, sm4_crypt_func func); + +int sm4_avx_ctr_crypt(struct skcipher_request *req, + unsigned int bsize, sm4_crypt_func func); + +#endif diff --git a/arch/x86/crypto/sm4_aesni_avx_glue.c b/arch/x86/crypto/sm4_aesni_avx_glue.c index c1f5728efd1d..7800f77d68ad 100644 --- a/arch/x86/crypto/sm4_aesni_avx_glue.c +++ b/arch/x86/crypto/sm4_aesni_avx_glue.c @@ -15,6 +15,7 @@ #include <crypto/internal/simd.h> #include <crypto/internal/skcipher.h> #include <crypto/sm4.h> +#include "sm4-avx.h"
#define SM4_CRYPT8_BLOCK_SIZE (SM4_BLOCK_SIZE * 8)
@@ -71,23 +72,25 @@ static int ecb_do_crypt(struct skcipher_request *req, const u32 *rkey) return err; }
-static int ecb_encrypt(struct skcipher_request *req) +int sm4_avx_ecb_encrypt(struct skcipher_request *req) { struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); struct sm4_ctx *ctx = crypto_skcipher_ctx(tfm);
return ecb_do_crypt(req, ctx->rkey_enc); } +EXPORT_SYMBOL_GPL(sm4_avx_ecb_encrypt);
-static int ecb_decrypt(struct skcipher_request *req) +int sm4_avx_ecb_decrypt(struct skcipher_request *req) { struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); struct sm4_ctx *ctx = crypto_skcipher_ctx(tfm);
return ecb_do_crypt(req, ctx->rkey_dec); } +EXPORT_SYMBOL_GPL(sm4_avx_ecb_decrypt);
-static int cbc_encrypt(struct skcipher_request *req) +int sm4_cbc_encrypt(struct skcipher_request *req) { struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); struct sm4_ctx *ctx = crypto_skcipher_ctx(tfm); @@ -118,8 +121,10 @@ static int cbc_encrypt(struct skcipher_request *req)
return err; } +EXPORT_SYMBOL_GPL(sm4_cbc_encrypt);
-static int cbc_decrypt(struct skcipher_request *req) +int sm4_avx_cbc_decrypt(struct skcipher_request *req, + unsigned int bsize, sm4_crypt_func func) { struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); struct sm4_ctx *ctx = crypto_skcipher_ctx(tfm); @@ -135,15 +140,14 @@ static int cbc_decrypt(struct skcipher_request *req)
kernel_fpu_begin();
- while (nbytes >= SM4_CRYPT8_BLOCK_SIZE) { - sm4_aesni_avx_cbc_dec_blk8(ctx->rkey_dec, dst, - src, walk.iv); - dst += SM4_CRYPT8_BLOCK_SIZE; - src += SM4_CRYPT8_BLOCK_SIZE; - nbytes -= SM4_CRYPT8_BLOCK_SIZE; + while (nbytes >= bsize) { + func(ctx->rkey_dec, dst, src, walk.iv); + dst += bsize; + src += bsize; + nbytes -= bsize; }
- if (nbytes >= SM4_BLOCK_SIZE) { + while (nbytes >= SM4_BLOCK_SIZE) { u8 keystream[SM4_BLOCK_SIZE * 8]; u8 iv[SM4_BLOCK_SIZE]; unsigned int nblocks = min(nbytes >> 4, 8u); @@ -165,6 +169,8 @@ static int cbc_decrypt(struct skcipher_request *req) } crypto_xor_cpy(dst, walk.iv, keystream, SM4_BLOCK_SIZE); memcpy(walk.iv, iv, SM4_BLOCK_SIZE); + dst += nblocks * SM4_BLOCK_SIZE; + src += (nblocks + 1) * SM4_BLOCK_SIZE; nbytes -= nblocks * SM4_BLOCK_SIZE; }
@@ -174,8 +180,15 @@ static int cbc_decrypt(struct skcipher_request *req)
return err; } +EXPORT_SYMBOL_GPL(sm4_avx_cbc_decrypt); + +static int cbc_decrypt(struct skcipher_request *req) +{ + return sm4_avx_cbc_decrypt(req, SM4_CRYPT8_BLOCK_SIZE, + sm4_aesni_avx_cbc_dec_blk8); +}
-static int cfb_encrypt(struct skcipher_request *req) +int sm4_cfb_encrypt(struct skcipher_request *req) { struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); struct sm4_ctx *ctx = crypto_skcipher_ctx(tfm); @@ -214,8 +227,10 @@ static int cfb_encrypt(struct skcipher_request *req)
return err; } +EXPORT_SYMBOL_GPL(sm4_cfb_encrypt);
-static int cfb_decrypt(struct skcipher_request *req) +int sm4_avx_cfb_decrypt(struct skcipher_request *req, + unsigned int bsize, sm4_crypt_func func) { struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); struct sm4_ctx *ctx = crypto_skcipher_ctx(tfm); @@ -231,15 +246,14 @@ static int cfb_decrypt(struct skcipher_request *req)
kernel_fpu_begin();
- while (nbytes >= SM4_CRYPT8_BLOCK_SIZE) { - sm4_aesni_avx_cfb_dec_blk8(ctx->rkey_enc, dst, - src, walk.iv); - dst += SM4_CRYPT8_BLOCK_SIZE; - src += SM4_CRYPT8_BLOCK_SIZE; - nbytes -= SM4_CRYPT8_BLOCK_SIZE; + while (nbytes >= bsize) { + func(ctx->rkey_enc, dst, src, walk.iv); + dst += bsize; + src += bsize; + nbytes -= bsize; }
- if (nbytes >= SM4_BLOCK_SIZE) { + while (nbytes >= SM4_BLOCK_SIZE) { u8 keystream[SM4_BLOCK_SIZE * 8]; unsigned int nblocks = min(nbytes >> 4, 8u);
@@ -276,8 +290,16 @@ static int cfb_decrypt(struct skcipher_request *req)
return err; } +EXPORT_SYMBOL_GPL(sm4_avx_cfb_decrypt);
-static int ctr_crypt(struct skcipher_request *req) +static int cfb_decrypt(struct skcipher_request *req) +{ + return sm4_avx_cfb_decrypt(req, SM4_CRYPT8_BLOCK_SIZE, + sm4_aesni_avx_cfb_dec_blk8); +} + +int sm4_avx_ctr_crypt(struct skcipher_request *req, + unsigned int bsize, sm4_crypt_func func) { struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); struct sm4_ctx *ctx = crypto_skcipher_ctx(tfm); @@ -293,15 +315,14 @@ static int ctr_crypt(struct skcipher_request *req)
kernel_fpu_begin();
- while (nbytes >= SM4_CRYPT8_BLOCK_SIZE) { - sm4_aesni_avx_ctr_enc_blk8(ctx->rkey_enc, dst, - src, walk.iv); - dst += SM4_CRYPT8_BLOCK_SIZE; - src += SM4_CRYPT8_BLOCK_SIZE; - nbytes -= SM4_CRYPT8_BLOCK_SIZE; + while (nbytes >= bsize) { + func(ctx->rkey_enc, dst, src, walk.iv); + dst += bsize; + src += bsize; + nbytes -= bsize; }
- if (nbytes >= SM4_BLOCK_SIZE) { + while (nbytes >= SM4_BLOCK_SIZE) { u8 keystream[SM4_BLOCK_SIZE * 8]; unsigned int nblocks = min(nbytes >> 4, 8u); int i; @@ -343,6 +364,13 @@ static int ctr_crypt(struct skcipher_request *req)
return err; } +EXPORT_SYMBOL_GPL(sm4_avx_ctr_crypt); + +static int ctr_crypt(struct skcipher_request *req) +{ + return sm4_avx_ctr_crypt(req, SM4_CRYPT8_BLOCK_SIZE, + sm4_aesni_avx_ctr_enc_blk8); +}
static struct skcipher_alg sm4_aesni_avx_skciphers[] = { { @@ -359,8 +387,8 @@ static struct skcipher_alg sm4_aesni_avx_skciphers[] = { .max_keysize = SM4_KEY_SIZE, .walksize = 8 * SM4_BLOCK_SIZE, .setkey = sm4_skcipher_setkey, - .encrypt = ecb_encrypt, - .decrypt = ecb_decrypt, + .encrypt = sm4_avx_ecb_encrypt, + .decrypt = sm4_avx_ecb_decrypt, }, { .base = { .cra_name = "__cbc(sm4)", @@ -376,7 +404,7 @@ static struct skcipher_alg sm4_aesni_avx_skciphers[] = { .ivsize = SM4_BLOCK_SIZE, .walksize = 8 * SM4_BLOCK_SIZE, .setkey = sm4_skcipher_setkey, - .encrypt = cbc_encrypt, + .encrypt = sm4_cbc_encrypt, .decrypt = cbc_decrypt, }, { .base = { @@ -394,7 +422,7 @@ static struct skcipher_alg sm4_aesni_avx_skciphers[] = { .chunksize = SM4_BLOCK_SIZE, .walksize = 8 * SM4_BLOCK_SIZE, .setkey = sm4_skcipher_setkey, - .encrypt = cfb_encrypt, + .encrypt = sm4_cfb_encrypt, .decrypt = cfb_decrypt, }, { .base = {
From: 沈子俊 shenzijun@kylinos.cn
mainline inclusion from mainline-v5.15.4 commit 5b2efa2bb865eb784e06987c7ce98c3c835b495b category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4A82K?from=project-issue CVE: NA
-------------------------------------------------------------------
Like the implementation of AESNI/AVX, this patch adds an accelerated implementation of AESNI/AVX2. In terms of code implementation, by reusing AESNI/AVX mode-related codes, the amount of code is greatly reduced. From the benchmark data, it can be seen that when the block size is 1024, compared to AVX acceleration, the performance achieved by AVX2 has increased by about 70%, it is also 7.7 times of the pure software implementation of sm4-generic.
The main algorithm implementation comes from SM4 AES-NI work by libgcrypt and Markku-Juhani O. Saarinen at: https://github.com/mjosaarinen/sm4ni
This optimization supports the four modes of SM4, ECB, CBC, CFB, and CTR. Since CBC and CFB do not support multiple block parallel encryption, the optimization effect is not obvious.
Benchmark on Intel i5-6200U 2.30GHz, performance data of three implementation methods, pure software sm4-generic, aesni/avx acceleration, and aesni/avx2 acceleration, the data comes from the 218 mode and 518 mode of tcrypt. The abscissas are blocks of different lengths. The data is tabulated and the unit is Mb/s:
block-size | 16 64 128 256 1024 1420 4096 sm4-generic ECB enc | 60.94 70.41 72.27 73.02 73.87 73.58 73.59 ECB dec | 61.87 70.53 72.15 73.09 73.89 73.92 73.86 CBC enc | 56.71 66.31 68.05 69.84 70.02 70.12 70.24 CBC dec | 54.54 65.91 68.22 69.51 70.63 70.79 70.82 CFB enc | 57.21 67.24 69.10 70.25 70.73 70.52 71.42 CFB dec | 57.22 64.74 66.31 67.24 67.40 67.64 67.58 CTR enc | 59.47 68.64 69.91 71.02 71.86 71.61 71.95 CTR dec | 59.94 68.77 69.95 71.00 71.84 71.55 71.95 sm4-aesni-avx ECB enc | 44.95 177.35 292.06 316.98 339.48 322.27 330.59 ECB dec | 45.28 178.66 292.31 317.52 339.59 322.52 331.16 CBC enc | 57.75 67.68 69.72 70.60 71.48 71.63 71.74 CBC dec | 44.32 176.83 284.32 307.24 328.61 312.61 325.82 CFB enc | 57.81 67.64 69.63 70.55 71.40 71.35 71.70 CFB dec | 43.14 167.78 282.03 307.20 328.35 318.24 325.95 CTR enc | 42.35 163.32 279.11 302.93 320.86 310.56 317.93 CTR dec | 42.39 162.81 278.49 302.37 321.11 310.33 318.37 sm4-aesni-avx2 ECB enc | 45.19 177.41 292.42 316.12 339.90 322.53 330.54 ECB dec | 44.83 178.90 291.45 317.31 339.85 322.55 331.07 CBC enc | 57.66 67.62 69.73 70.55 71.58 71.66 71.77 CBC dec | 44.34 176.86 286.10 501.68 559.58 483.87 527.46 CFB enc | 57.43 67.60 69.61 70.52 71.43 71.28 71.65 CFB dec | 43.12 167.75 268.09 499.33 558.35 490.36 524.73 CTR enc | 42.42 163.39 256.17 493.95 552.45 481.58 517.19 CTR dec | 42.49 163.11 256.36 493.34 552.62 481.49 516.83
Signed-off-by: Tianjia Zhang tianjia.zhang@linux.alibaba.com Signed-off-by: Herbert Xu herbert@gondor.apana.org.au Signed-off-by: 沈子俊 shenzijun@kylinos.cn Reviewed-by: weiyang wang wangweiyang2@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- arch/x86/crypto/Makefile | 3 + arch/x86/crypto/sm4-aesni-avx2-asm_64.S | 497 ++++++++++++++++++++++++ arch/x86/crypto/sm4_aesni_avx2_glue.c | 169 ++++++++ crypto/Kconfig | 22 ++ 4 files changed, 691 insertions(+) create mode 100644 arch/x86/crypto/sm4-aesni-avx2-asm_64.S create mode 100644 arch/x86/crypto/sm4_aesni_avx2_glue.c
diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile index 5535810beef5..697b8ccfb763 100644 --- a/arch/x86/crypto/Makefile +++ b/arch/x86/crypto/Makefile @@ -95,6 +95,9 @@ obj-$(CONFIG_CRYPTO_CURVE25519_X86) += curve25519-x86_64.o obj-$(CONFIG_CRYPTO_SM4_AESNI_AVX_X86_64) += sm4-aesni-avx-x86_64.o sm4-aesni-avx-x86_64-y := sm4-aesni-avx-asm_64.o sm4_aesni_avx_glue.o
+obj-$(CONFIG_CRYPTO_SM4_AESNI_AVX2_X86_64) += sm4-aesni-avx2-x86_64.o +sm4-aesni-avx2-x86_64-y := sm4-aesni-avx2-asm_64.o sm4_aesni_avx2_glue.o + quiet_cmd_perlasm = PERLASM $@ cmd_perlasm = $(PERL) $< > $@ $(obj)/%.S: $(src)/%.pl FORCE diff --git a/arch/x86/crypto/sm4-aesni-avx2-asm_64.S b/arch/x86/crypto/sm4-aesni-avx2-asm_64.S new file mode 100644 index 000000000000..d2ffd7f76ee2 --- /dev/null +++ b/arch/x86/crypto/sm4-aesni-avx2-asm_64.S @@ -0,0 +1,497 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * SM4 Cipher Algorithm, AES-NI/AVX2 optimized. + * as specified in + * https://tools.ietf.org/id/draft-ribose-cfrg-sm4-10.html + * + * Copyright (C) 2018 Markku-Juhani O. Saarinen mjos@iki.fi + * Copyright (C) 2020 Jussi Kivilinna jussi.kivilinna@iki.fi + * Copyright (c) 2021 Tianjia Zhang tianjia.zhang@linux.alibaba.com + */ + +/* Based on SM4 AES-NI work by libgcrypt and Markku-Juhani O. Saarinen at: + * https://github.com/mjosaarinen/sm4ni + */ + +#include <linux/linkage.h> +#include <asm/frame.h> + +#define rRIP (%rip) + +/* vector registers */ +#define RX0 %ymm0 +#define RX1 %ymm1 +#define MASK_4BIT %ymm2 +#define RTMP0 %ymm3 +#define RTMP1 %ymm4 +#define RTMP2 %ymm5 +#define RTMP3 %ymm6 +#define RTMP4 %ymm7 + +#define RA0 %ymm8 +#define RA1 %ymm9 +#define RA2 %ymm10 +#define RA3 %ymm11 + +#define RB0 %ymm12 +#define RB1 %ymm13 +#define RB2 %ymm14 +#define RB3 %ymm15 + +#define RNOT %ymm0 +#define RBSWAP %ymm1 + +#define RX0x %xmm0 +#define RX1x %xmm1 +#define MASK_4BITx %xmm2 + +#define RNOTx %xmm0 +#define RBSWAPx %xmm1 + +#define RTMP0x %xmm3 +#define RTMP1x %xmm4 +#define RTMP2x %xmm5 +#define RTMP3x %xmm6 +#define RTMP4x %xmm7 + + +/* helper macros */ + +/* Transpose four 32-bit words between 128-bit vector lanes. */ +#define transpose_4x4(x0, x1, x2, x3, t1, t2) \ + vpunpckhdq x1, x0, t2; \ + vpunpckldq x1, x0, x0; \ + \ + vpunpckldq x3, x2, t1; \ + vpunpckhdq x3, x2, x2; \ + \ + vpunpckhqdq t1, x0, x1; \ + vpunpcklqdq t1, x0, x0; \ + \ + vpunpckhqdq x2, t2, x3; \ + vpunpcklqdq x2, t2, x2; + +/* post-SubByte transform. */ +#define transform_pre(x, lo_t, hi_t, mask4bit, tmp0) \ + vpand x, mask4bit, tmp0; \ + vpandn x, mask4bit, x; \ + vpsrld $4, x, x; \ + \ + vpshufb tmp0, lo_t, tmp0; \ + vpshufb x, hi_t, x; \ + vpxor tmp0, x, x; + +/* post-SubByte transform. Note: x has been XOR'ed with mask4bit by + * 'vaeslastenc' instruction. */ +#define transform_post(x, lo_t, hi_t, mask4bit, tmp0) \ + vpandn mask4bit, x, tmp0; \ + vpsrld $4, x, x; \ + vpand x, mask4bit, x; \ + \ + vpshufb tmp0, lo_t, tmp0; \ + vpshufb x, hi_t, x; \ + vpxor tmp0, x, x; + + +.section .rodata.cst164, "aM", @progbits, 164 +.align 16 + +/* + * Following four affine transform look-up tables are from work by + * Markku-Juhani O. Saarinen, at https://github.com/mjosaarinen/sm4ni + * + * These allow exposing SM4 S-Box from AES SubByte. + */ + +/* pre-SubByte affine transform, from SM4 field to AES field. */ +.Lpre_tf_lo_s: + .quad 0x9197E2E474720701, 0xC7C1B4B222245157 +.Lpre_tf_hi_s: + .quad 0xE240AB09EB49A200, 0xF052B91BF95BB012 + +/* post-SubByte affine transform, from AES field to SM4 field. */ +.Lpost_tf_lo_s: + .quad 0x5B67F2CEA19D0834, 0xEDD14478172BBE82 +.Lpost_tf_hi_s: + .quad 0xAE7201DD73AFDC00, 0x11CDBE62CC1063BF + +/* For isolating SubBytes from AESENCLAST, inverse shift row */ +.Linv_shift_row: + .byte 0x00, 0x0d, 0x0a, 0x07, 0x04, 0x01, 0x0e, 0x0b + .byte 0x08, 0x05, 0x02, 0x0f, 0x0c, 0x09, 0x06, 0x03 + +/* Inverse shift row + Rotate left by 8 bits on 32-bit words with vpshufb */ +.Linv_shift_row_rol_8: + .byte 0x07, 0x00, 0x0d, 0x0a, 0x0b, 0x04, 0x01, 0x0e + .byte 0x0f, 0x08, 0x05, 0x02, 0x03, 0x0c, 0x09, 0x06 + +/* Inverse shift row + Rotate left by 16 bits on 32-bit words with vpshufb */ +.Linv_shift_row_rol_16: + .byte 0x0a, 0x07, 0x00, 0x0d, 0x0e, 0x0b, 0x04, 0x01 + .byte 0x02, 0x0f, 0x08, 0x05, 0x06, 0x03, 0x0c, 0x09 + +/* Inverse shift row + Rotate left by 24 bits on 32-bit words with vpshufb */ +.Linv_shift_row_rol_24: + .byte 0x0d, 0x0a, 0x07, 0x00, 0x01, 0x0e, 0x0b, 0x04 + .byte 0x05, 0x02, 0x0f, 0x08, 0x09, 0x06, 0x03, 0x0c + +/* For CTR-mode IV byteswap */ +.Lbswap128_mask: + .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 + +/* For input word byte-swap */ +.Lbswap32_mask: + .byte 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 + +.align 4 +/* 4-bit mask */ +.L0f0f0f0f: + .long 0x0f0f0f0f + +.text +.align 16 + +.align 8 +SYM_FUNC_START_LOCAL(__sm4_crypt_blk16) + /* input: + * %rdi: round key array, CTX + * RA0, RA1, RA2, RA3, RB0, RB1, RB2, RB3: sixteen parallel + * plaintext blocks + * output: + * RA0, RA1, RA2, RA3, RB0, RB1, RB2, RB3: sixteen parallel + * ciphertext blocks + */ + FRAME_BEGIN + + vbroadcasti128 .Lbswap32_mask rRIP, RTMP2; + vpshufb RTMP2, RA0, RA0; + vpshufb RTMP2, RA1, RA1; + vpshufb RTMP2, RA2, RA2; + vpshufb RTMP2, RA3, RA3; + vpshufb RTMP2, RB0, RB0; + vpshufb RTMP2, RB1, RB1; + vpshufb RTMP2, RB2, RB2; + vpshufb RTMP2, RB3, RB3; + + vpbroadcastd .L0f0f0f0f rRIP, MASK_4BIT; + transpose_4x4(RA0, RA1, RA2, RA3, RTMP0, RTMP1); + transpose_4x4(RB0, RB1, RB2, RB3, RTMP0, RTMP1); + +#define ROUND(round, s0, s1, s2, s3, r0, r1, r2, r3) \ + vpbroadcastd (4*(round))(%rdi), RX0; \ + vbroadcasti128 .Lpre_tf_lo_s rRIP, RTMP4; \ + vbroadcasti128 .Lpre_tf_hi_s rRIP, RTMP1; \ + vmovdqa RX0, RX1; \ + vpxor s1, RX0, RX0; \ + vpxor s2, RX0, RX0; \ + vpxor s3, RX0, RX0; /* s1 ^ s2 ^ s3 ^ rk */ \ + vbroadcasti128 .Lpost_tf_lo_s rRIP, RTMP2; \ + vbroadcasti128 .Lpost_tf_hi_s rRIP, RTMP3; \ + vpxor r1, RX1, RX1; \ + vpxor r2, RX1, RX1; \ + vpxor r3, RX1, RX1; /* r1 ^ r2 ^ r3 ^ rk */ \ + \ + /* sbox, non-linear part */ \ + transform_pre(RX0, RTMP4, RTMP1, MASK_4BIT, RTMP0); \ + transform_pre(RX1, RTMP4, RTMP1, MASK_4BIT, RTMP0); \ + vextracti128 $1, RX0, RTMP4x; \ + vextracti128 $1, RX1, RTMP0x; \ + vaesenclast MASK_4BITx, RX0x, RX0x; \ + vaesenclast MASK_4BITx, RTMP4x, RTMP4x; \ + vaesenclast MASK_4BITx, RX1x, RX1x; \ + vaesenclast MASK_4BITx, RTMP0x, RTMP0x; \ + vinserti128 $1, RTMP4x, RX0, RX0; \ + vbroadcasti128 .Linv_shift_row rRIP, RTMP4; \ + vinserti128 $1, RTMP0x, RX1, RX1; \ + transform_post(RX0, RTMP2, RTMP3, MASK_4BIT, RTMP0); \ + transform_post(RX1, RTMP2, RTMP3, MASK_4BIT, RTMP0); \ + \ + /* linear part */ \ + vpshufb RTMP4, RX0, RTMP0; \ + vpxor RTMP0, s0, s0; /* s0 ^ x */ \ + vpshufb RTMP4, RX1, RTMP2; \ + vbroadcasti128 .Linv_shift_row_rol_8 rRIP, RTMP4; \ + vpxor RTMP2, r0, r0; /* r0 ^ x */ \ + vpshufb RTMP4, RX0, RTMP1; \ + vpxor RTMP1, RTMP0, RTMP0; /* x ^ rol(x,8) */ \ + vpshufb RTMP4, RX1, RTMP3; \ + vbroadcasti128 .Linv_shift_row_rol_16 rRIP, RTMP4; \ + vpxor RTMP3, RTMP2, RTMP2; /* x ^ rol(x,8) */ \ + vpshufb RTMP4, RX0, RTMP1; \ + vpxor RTMP1, RTMP0, RTMP0; /* x ^ rol(x,8) ^ rol(x,16) */ \ + vpshufb RTMP4, RX1, RTMP3; \ + vbroadcasti128 .Linv_shift_row_rol_24 rRIP, RTMP4; \ + vpxor RTMP3, RTMP2, RTMP2; /* x ^ rol(x,8) ^ rol(x,16) */ \ + vpshufb RTMP4, RX0, RTMP1; \ + vpxor RTMP1, s0, s0; /* s0 ^ x ^ rol(x,24) */ \ + vpslld $2, RTMP0, RTMP1; \ + vpsrld $30, RTMP0, RTMP0; \ + vpxor RTMP0, s0, s0; \ + /* s0 ^ x ^ rol(x,2) ^ rol(x,10) ^ rol(x,18) ^ rol(x,24) */ \ + vpxor RTMP1, s0, s0; \ + vpshufb RTMP4, RX1, RTMP3; \ + vpxor RTMP3, r0, r0; /* r0 ^ x ^ rol(x,24) */ \ + vpslld $2, RTMP2, RTMP3; \ + vpsrld $30, RTMP2, RTMP2; \ + vpxor RTMP2, r0, r0; \ + /* r0 ^ x ^ rol(x,2) ^ rol(x,10) ^ rol(x,18) ^ rol(x,24) */ \ + vpxor RTMP3, r0, r0; + + leaq (32*4)(%rdi), %rax; +.align 16 +.Lroundloop_blk8: + ROUND(0, RA0, RA1, RA2, RA3, RB0, RB1, RB2, RB3); + ROUND(1, RA1, RA2, RA3, RA0, RB1, RB2, RB3, RB0); + ROUND(2, RA2, RA3, RA0, RA1, RB2, RB3, RB0, RB1); + ROUND(3, RA3, RA0, RA1, RA2, RB3, RB0, RB1, RB2); + leaq (4*4)(%rdi), %rdi; + cmpq %rax, %rdi; + jne .Lroundloop_blk8; + +#undef ROUND + + vbroadcasti128 .Lbswap128_mask rRIP, RTMP2; + + transpose_4x4(RA0, RA1, RA2, RA3, RTMP0, RTMP1); + transpose_4x4(RB0, RB1, RB2, RB3, RTMP0, RTMP1); + vpshufb RTMP2, RA0, RA0; + vpshufb RTMP2, RA1, RA1; + vpshufb RTMP2, RA2, RA2; + vpshufb RTMP2, RA3, RA3; + vpshufb RTMP2, RB0, RB0; + vpshufb RTMP2, RB1, RB1; + vpshufb RTMP2, RB2, RB2; + vpshufb RTMP2, RB3, RB3; + + FRAME_END + ret; +SYM_FUNC_END(__sm4_crypt_blk16) + +#define inc_le128(x, minus_one, tmp) \ + vpcmpeqq minus_one, x, tmp; \ + vpsubq minus_one, x, x; \ + vpslldq $8, tmp, tmp; \ + vpsubq tmp, x, x; + +/* + * void sm4_aesni_avx2_ctr_enc_blk16(const u32 *rk, u8 *dst, + * const u8 *src, u8 *iv) + */ +.align 8 +SYM_FUNC_START(sm4_aesni_avx2_ctr_enc_blk16) + /* input: + * %rdi: round key array, CTX + * %rsi: dst (16 blocks) + * %rdx: src (16 blocks) + * %rcx: iv (big endian, 128bit) + */ + FRAME_BEGIN + + movq 8(%rcx), %rax; + bswapq %rax; + + vzeroupper; + + vbroadcasti128 .Lbswap128_mask rRIP, RTMP3; + vpcmpeqd RNOT, RNOT, RNOT; + vpsrldq $8, RNOT, RNOT; /* ab: -1:0 ; cd: -1:0 */ + vpaddq RNOT, RNOT, RTMP2; /* ab: -2:0 ; cd: -2:0 */ + + /* load IV and byteswap */ + vmovdqu (%rcx), RTMP4x; + vpshufb RTMP3x, RTMP4x, RTMP4x; + vmovdqa RTMP4x, RTMP0x; + inc_le128(RTMP4x, RNOTx, RTMP1x); + vinserti128 $1, RTMP4x, RTMP0, RTMP0; + vpshufb RTMP3, RTMP0, RA0; /* +1 ; +0 */ + + /* check need for handling 64-bit overflow and carry */ + cmpq $(0xffffffffffffffff - 16), %rax; + ja .Lhandle_ctr_carry; + + /* construct IVs */ + vpsubq RTMP2, RTMP0, RTMP0; /* +3 ; +2 */ + vpshufb RTMP3, RTMP0, RA1; + vpsubq RTMP2, RTMP0, RTMP0; /* +5 ; +4 */ + vpshufb RTMP3, RTMP0, RA2; + vpsubq RTMP2, RTMP0, RTMP0; /* +7 ; +6 */ + vpshufb RTMP3, RTMP0, RA3; + vpsubq RTMP2, RTMP0, RTMP0; /* +9 ; +8 */ + vpshufb RTMP3, RTMP0, RB0; + vpsubq RTMP2, RTMP0, RTMP0; /* +11 ; +10 */ + vpshufb RTMP3, RTMP0, RB1; + vpsubq RTMP2, RTMP0, RTMP0; /* +13 ; +12 */ + vpshufb RTMP3, RTMP0, RB2; + vpsubq RTMP2, RTMP0, RTMP0; /* +15 ; +14 */ + vpshufb RTMP3, RTMP0, RB3; + vpsubq RTMP2, RTMP0, RTMP0; /* +16 */ + vpshufb RTMP3x, RTMP0x, RTMP0x; + + jmp .Lctr_carry_done; + +.Lhandle_ctr_carry: + /* construct IVs */ + inc_le128(RTMP0, RNOT, RTMP1); + inc_le128(RTMP0, RNOT, RTMP1); + vpshufb RTMP3, RTMP0, RA1; /* +3 ; +2 */ + inc_le128(RTMP0, RNOT, RTMP1); + inc_le128(RTMP0, RNOT, RTMP1); + vpshufb RTMP3, RTMP0, RA2; /* +5 ; +4 */ + inc_le128(RTMP0, RNOT, RTMP1); + inc_le128(RTMP0, RNOT, RTMP1); + vpshufb RTMP3, RTMP0, RA3; /* +7 ; +6 */ + inc_le128(RTMP0, RNOT, RTMP1); + inc_le128(RTMP0, RNOT, RTMP1); + vpshufb RTMP3, RTMP0, RB0; /* +9 ; +8 */ + inc_le128(RTMP0, RNOT, RTMP1); + inc_le128(RTMP0, RNOT, RTMP1); + vpshufb RTMP3, RTMP0, RB1; /* +11 ; +10 */ + inc_le128(RTMP0, RNOT, RTMP1); + inc_le128(RTMP0, RNOT, RTMP1); + vpshufb RTMP3, RTMP0, RB2; /* +13 ; +12 */ + inc_le128(RTMP0, RNOT, RTMP1); + inc_le128(RTMP0, RNOT, RTMP1); + vpshufb RTMP3, RTMP0, RB3; /* +15 ; +14 */ + inc_le128(RTMP0, RNOT, RTMP1); + vextracti128 $1, RTMP0, RTMP0x; + vpshufb RTMP3x, RTMP0x, RTMP0x; /* +16 */ + +.align 4 +.Lctr_carry_done: + /* store new IV */ + vmovdqu RTMP0x, (%rcx); + + call __sm4_crypt_blk16; + + vpxor (0 * 32)(%rdx), RA0, RA0; + vpxor (1 * 32)(%rdx), RA1, RA1; + vpxor (2 * 32)(%rdx), RA2, RA2; + vpxor (3 * 32)(%rdx), RA3, RA3; + vpxor (4 * 32)(%rdx), RB0, RB0; + vpxor (5 * 32)(%rdx), RB1, RB1; + vpxor (6 * 32)(%rdx), RB2, RB2; + vpxor (7 * 32)(%rdx), RB3, RB3; + + vmovdqu RA0, (0 * 32)(%rsi); + vmovdqu RA1, (1 * 32)(%rsi); + vmovdqu RA2, (2 * 32)(%rsi); + vmovdqu RA3, (3 * 32)(%rsi); + vmovdqu RB0, (4 * 32)(%rsi); + vmovdqu RB1, (5 * 32)(%rsi); + vmovdqu RB2, (6 * 32)(%rsi); + vmovdqu RB3, (7 * 32)(%rsi); + + vzeroall; + FRAME_END + ret; +SYM_FUNC_END(sm4_aesni_avx2_ctr_enc_blk16) + +/* + * void sm4_aesni_avx2_cbc_dec_blk16(const u32 *rk, u8 *dst, + * const u8 *src, u8 *iv) + */ +.align 8 +SYM_FUNC_START(sm4_aesni_avx2_cbc_dec_blk16) + /* input: + * %rdi: round key array, CTX + * %rsi: dst (16 blocks) + * %rdx: src (16 blocks) + * %rcx: iv + */ + FRAME_BEGIN + + vzeroupper; + + vmovdqu (0 * 32)(%rdx), RA0; + vmovdqu (1 * 32)(%rdx), RA1; + vmovdqu (2 * 32)(%rdx), RA2; + vmovdqu (3 * 32)(%rdx), RA3; + vmovdqu (4 * 32)(%rdx), RB0; + vmovdqu (5 * 32)(%rdx), RB1; + vmovdqu (6 * 32)(%rdx), RB2; + vmovdqu (7 * 32)(%rdx), RB3; + + call __sm4_crypt_blk16; + + vmovdqu (%rcx), RNOTx; + vinserti128 $1, (%rdx), RNOT, RNOT; + vpxor RNOT, RA0, RA0; + vpxor (0 * 32 + 16)(%rdx), RA1, RA1; + vpxor (1 * 32 + 16)(%rdx), RA2, RA2; + vpxor (2 * 32 + 16)(%rdx), RA3, RA3; + vpxor (3 * 32 + 16)(%rdx), RB0, RB0; + vpxor (4 * 32 + 16)(%rdx), RB1, RB1; + vpxor (5 * 32 + 16)(%rdx), RB2, RB2; + vpxor (6 * 32 + 16)(%rdx), RB3, RB3; + vmovdqu (7 * 32 + 16)(%rdx), RNOTx; + vmovdqu RNOTx, (%rcx); /* store new IV */ + + vmovdqu RA0, (0 * 32)(%rsi); + vmovdqu RA1, (1 * 32)(%rsi); + vmovdqu RA2, (2 * 32)(%rsi); + vmovdqu RA3, (3 * 32)(%rsi); + vmovdqu RB0, (4 * 32)(%rsi); + vmovdqu RB1, (5 * 32)(%rsi); + vmovdqu RB2, (6 * 32)(%rsi); + vmovdqu RB3, (7 * 32)(%rsi); + + vzeroall; + FRAME_END + ret; +SYM_FUNC_END(sm4_aesni_avx2_cbc_dec_blk16) + +/* + * void sm4_aesni_avx2_cfb_dec_blk16(const u32 *rk, u8 *dst, + * const u8 *src, u8 *iv) + */ +.align 8 +SYM_FUNC_START(sm4_aesni_avx2_cfb_dec_blk16) + /* input: + * %rdi: round key array, CTX + * %rsi: dst (16 blocks) + * %rdx: src (16 blocks) + * %rcx: iv + */ + FRAME_BEGIN + + vzeroupper; + + /* Load input */ + vmovdqu (%rcx), RNOTx; + vinserti128 $1, (%rdx), RNOT, RA0; + vmovdqu (0 * 32 + 16)(%rdx), RA1; + vmovdqu (1 * 32 + 16)(%rdx), RA2; + vmovdqu (2 * 32 + 16)(%rdx), RA3; + vmovdqu (3 * 32 + 16)(%rdx), RB0; + vmovdqu (4 * 32 + 16)(%rdx), RB1; + vmovdqu (5 * 32 + 16)(%rdx), RB2; + vmovdqu (6 * 32 + 16)(%rdx), RB3; + + /* Update IV */ + vmovdqu (7 * 32 + 16)(%rdx), RNOTx; + vmovdqu RNOTx, (%rcx); + + call __sm4_crypt_blk16; + + vpxor (0 * 32)(%rdx), RA0, RA0; + vpxor (1 * 32)(%rdx), RA1, RA1; + vpxor (2 * 32)(%rdx), RA2, RA2; + vpxor (3 * 32)(%rdx), RA3, RA3; + vpxor (4 * 32)(%rdx), RB0, RB0; + vpxor (5 * 32)(%rdx), RB1, RB1; + vpxor (6 * 32)(%rdx), RB2, RB2; + vpxor (7 * 32)(%rdx), RB3, RB3; + + vmovdqu RA0, (0 * 32)(%rsi); + vmovdqu RA1, (1 * 32)(%rsi); + vmovdqu RA2, (2 * 32)(%rsi); + vmovdqu RA3, (3 * 32)(%rsi); + vmovdqu RB0, (4 * 32)(%rsi); + vmovdqu RB1, (5 * 32)(%rsi); + vmovdqu RB2, (6 * 32)(%rsi); + vmovdqu RB3, (7 * 32)(%rsi); + + vzeroall; + FRAME_END + ret; +SYM_FUNC_END(sm4_aesni_avx2_cfb_dec_blk16) diff --git a/arch/x86/crypto/sm4_aesni_avx2_glue.c b/arch/x86/crypto/sm4_aesni_avx2_glue.c new file mode 100644 index 000000000000..84bc718f49a3 --- /dev/null +++ b/arch/x86/crypto/sm4_aesni_avx2_glue.c @@ -0,0 +1,169 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * SM4 Cipher Algorithm, AES-NI/AVX2 optimized. + * as specified in + * https://tools.ietf.org/id/draft-ribose-cfrg-sm4-10.html + * + * Copyright (c) 2021, Alibaba Group. + * Copyright (c) 2021 Tianjia Zhang tianjia.zhang@linux.alibaba.com + */ + +#include <linux/module.h> +#include <linux/crypto.h> +#include <linux/kernel.h> +#include <asm/simd.h> +#include <crypto/internal/simd.h> +#include <crypto/internal/skcipher.h> +#include <crypto/sm4.h> +#include "sm4-avx.h" + +#define SM4_CRYPT16_BLOCK_SIZE (SM4_BLOCK_SIZE * 16) + +asmlinkage void sm4_aesni_avx2_ctr_enc_blk16(const u32 *rk, u8 *dst, + const u8 *src, u8 *iv); +asmlinkage void sm4_aesni_avx2_cbc_dec_blk16(const u32 *rk, u8 *dst, + const u8 *src, u8 *iv); +asmlinkage void sm4_aesni_avx2_cfb_dec_blk16(const u32 *rk, u8 *dst, + const u8 *src, u8 *iv); + +static int sm4_skcipher_setkey(struct crypto_skcipher *tfm, const u8 *key, + unsigned int key_len) +{ + struct sm4_ctx *ctx = crypto_skcipher_ctx(tfm); + + return sm4_expandkey(ctx, key, key_len); +} + +static int cbc_decrypt(struct skcipher_request *req) +{ + return sm4_avx_cbc_decrypt(req, SM4_CRYPT16_BLOCK_SIZE, + sm4_aesni_avx2_cbc_dec_blk16); +} + + +static int cfb_decrypt(struct skcipher_request *req) +{ + return sm4_avx_cfb_decrypt(req, SM4_CRYPT16_BLOCK_SIZE, + sm4_aesni_avx2_cfb_dec_blk16); +} + +static int ctr_crypt(struct skcipher_request *req) +{ + return sm4_avx_ctr_crypt(req, SM4_CRYPT16_BLOCK_SIZE, + sm4_aesni_avx2_ctr_enc_blk16); +} + +static struct skcipher_alg sm4_aesni_avx2_skciphers[] = { + { + .base = { + .cra_name = "__ecb(sm4)", + .cra_driver_name = "__ecb-sm4-aesni-avx2", + .cra_priority = 500, + .cra_flags = CRYPTO_ALG_INTERNAL, + .cra_blocksize = SM4_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct sm4_ctx), + .cra_module = THIS_MODULE, + }, + .min_keysize = SM4_KEY_SIZE, + .max_keysize = SM4_KEY_SIZE, + .walksize = 16 * SM4_BLOCK_SIZE, + .setkey = sm4_skcipher_setkey, + .encrypt = sm4_avx_ecb_encrypt, + .decrypt = sm4_avx_ecb_decrypt, + }, { + .base = { + .cra_name = "__cbc(sm4)", + .cra_driver_name = "__cbc-sm4-aesni-avx2", + .cra_priority = 500, + .cra_flags = CRYPTO_ALG_INTERNAL, + .cra_blocksize = SM4_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct sm4_ctx), + .cra_module = THIS_MODULE, + }, + .min_keysize = SM4_KEY_SIZE, + .max_keysize = SM4_KEY_SIZE, + .ivsize = SM4_BLOCK_SIZE, + .walksize = 16 * SM4_BLOCK_SIZE, + .setkey = sm4_skcipher_setkey, + .encrypt = sm4_cbc_encrypt, + .decrypt = cbc_decrypt, + }, { + .base = { + .cra_name = "__cfb(sm4)", + .cra_driver_name = "__cfb-sm4-aesni-avx2", + .cra_priority = 500, + .cra_flags = CRYPTO_ALG_INTERNAL, + .cra_blocksize = 1, + .cra_ctxsize = sizeof(struct sm4_ctx), + .cra_module = THIS_MODULE, + }, + .min_keysize = SM4_KEY_SIZE, + .max_keysize = SM4_KEY_SIZE, + .ivsize = SM4_BLOCK_SIZE, + .chunksize = SM4_BLOCK_SIZE, + .walksize = 16 * SM4_BLOCK_SIZE, + .setkey = sm4_skcipher_setkey, + .encrypt = sm4_cfb_encrypt, + .decrypt = cfb_decrypt, + }, { + .base = { + .cra_name = "__ctr(sm4)", + .cra_driver_name = "__ctr-sm4-aesni-avx2", + .cra_priority = 500, + .cra_flags = CRYPTO_ALG_INTERNAL, + .cra_blocksize = 1, + .cra_ctxsize = sizeof(struct sm4_ctx), + .cra_module = THIS_MODULE, + }, + .min_keysize = SM4_KEY_SIZE, + .max_keysize = SM4_KEY_SIZE, + .ivsize = SM4_BLOCK_SIZE, + .chunksize = SM4_BLOCK_SIZE, + .walksize = 16 * SM4_BLOCK_SIZE, + .setkey = sm4_skcipher_setkey, + .encrypt = ctr_crypt, + .decrypt = ctr_crypt, + } +}; + +static struct simd_skcipher_alg * +simd_sm4_aesni_avx2_skciphers[ARRAY_SIZE(sm4_aesni_avx2_skciphers)]; + +static int __init sm4_init(void) +{ + const char *feature_name; + + if (!boot_cpu_has(X86_FEATURE_AVX) || + !boot_cpu_has(X86_FEATURE_AVX2) || + !boot_cpu_has(X86_FEATURE_AES) || + !boot_cpu_has(X86_FEATURE_OSXSAVE)) { + pr_info("AVX2 or AES-NI instructions are not detected.\n"); + return -ENODEV; + } + + if (!cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, + &feature_name)) { + pr_info("CPU feature '%s' is not supported.\n", feature_name); + return -ENODEV; + } + + return simd_register_skciphers_compat(sm4_aesni_avx2_skciphers, + ARRAY_SIZE(sm4_aesni_avx2_skciphers), + simd_sm4_aesni_avx2_skciphers); +} + +static void __exit sm4_exit(void) +{ + simd_unregister_skciphers(sm4_aesni_avx2_skciphers, + ARRAY_SIZE(sm4_aesni_avx2_skciphers), + simd_sm4_aesni_avx2_skciphers); +} + +module_init(sm4_init); +module_exit(sm4_exit); + +MODULE_LICENSE("GPL v2"); +MODULE_AUTHOR("Tianjia Zhang tianjia.zhang@linux.alibaba.com"); +MODULE_DESCRIPTION("SM4 Cipher Algorithm, AES-NI/AVX2 optimized"); +MODULE_ALIAS_CRYPTO("sm4"); +MODULE_ALIAS_CRYPTO("sm4-aesni-avx2"); diff --git a/crypto/Kconfig b/crypto/Kconfig index 9f4ea9c26652..9a4878cb0141 100644 --- a/crypto/Kconfig +++ b/crypto/Kconfig @@ -1663,6 +1663,28 @@ config CRYPTO_SM4_AESNI_AVX_X86_64
If unsure, say N.
+config CRYPTO_SM4_AESNI_AVX2_X86_64 + tristate "SM4 cipher algorithm (x86_64/AES-NI/AVX2)" + depends on X86 && 64BIT + select CRYPTO_SKCIPHER + select CRYPTO_SIMD + select CRYPTO_ALGAPI + select CRYPTO_LIB_SM4 + select CRYPTO_SM4_AESNI_AVX_X86_64 + help + SM4 cipher algorithms (OSCCA GB/T 32907-2016) (x86_64/AES-NI/AVX2). + + SM4 (GBT.32907-2016) is a cryptographic standard issued by the + Organization of State Commercial Administration of China (OSCCA) + as an authorized cryptographic algorithms for the use within China. + + This is SM4 optimized implementation using AES-NI/AVX2/x86_64 + instruction set for block cipher. Through two affine transforms, + we can use the AES S-Box to simulate the SM4 S-Box to achieve the + effect of instruction acceleration. + + If unsure, say N. + config CRYPTO_TEA tristate "TEA, XTEA and XETA cipher algorithms" depends on CRYPTO_USER_API_ENABLE_OBSOLETE
From: 沈子俊 shenzijun@kylinos.cn
kylin inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4A842?from=project-issue CVE: NA
-------------------------------------------------------------------------
Add the configuration in arch/x86/configs/openeuler_defconfig
Signed-off-by: 沈子俊 shenzijun@kylinos.cn Reviewed-by: weiyang wang wangweiyang2@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- arch/x86/configs/openeuler_defconfig | 2 ++ 1 file changed, 2 insertions(+)
diff --git a/arch/x86/configs/openeuler_defconfig b/arch/x86/configs/openeuler_defconfig index 7b608301823c..e0dc1686984d 100644 --- a/arch/x86/configs/openeuler_defconfig +++ b/arch/x86/configs/openeuler_defconfig @@ -8001,6 +8001,8 @@ CONFIG_CRYPTO_SERPENT_SSE2_X86_64=m CONFIG_CRYPTO_SERPENT_AVX_X86_64=m CONFIG_CRYPTO_SERPENT_AVX2_X86_64=m CONFIG_CRYPTO_SM4=m +CONFIG_CRYPTO_SM4_AESNI_AVX_X86_64=m +CONFIG_CRYPTO_SM4_AESNI_AVX2_X86_64=m CONFIG_CRYPTO_TEA=m CONFIG_CRYPTO_TWOFISH=m CONFIG_CRYPTO_TWOFISH_COMMON=m
From: 沈子俊 shenzijun@kylinos.cn
mainline inclusion from mainline-v5.2-rc1 commit 0e14ef38669ce4faa80589247fe8ed8a3780f414 category: bugfix bugzilla: NA CVE: NA
-----------------------------------------------------------------------
sm4_aesni_avx_crypt8() sets up the frame pointer (which includes pushing RBP) before doing a conditional sibling call to sm4_aesni_avx_crypt4(), which sets up an additional frame pointer. Things will not go well when sm4_aesni_avx_crypt4() pops only the innermost single frame pointer and then tries to return to the outermost frame pointer.
Sibling calls need to occur with an empty stack frame. Do the conditional sibling call *before* setting up the stack pointer.
This fixes the following warning:
arch/x86/crypto/sm4-aesni-avx-asm_64.o: warning: objtool: sm4_aesni_avx_crypt8()+0x8: sibling call from callable instruction with modified stack frame
Fixes: a7ee22ee1445 ("crypto: x86/sm4 - add AES-NI/AVX/x86_64 implementation") Reported-by: kernel test robot lkp@intel.com Reported-by: Arnd Bergmann arnd@kernel.org Acked-by: Peter Zijlstra (Intel) peterz@infradead.org Reviewed-by: Tianjia Zhang tianjia.zhang@linux.alibaba.com Signed-off-by: Josh Poimboeuf jpoimboe@redhat.com Signed-off-by: Herbert Xu herbert@gondor.apana.org.au Signed-off-by: 沈子俊 shenzijun@kylinos.cn Reviewed-by: weiyang wang wangweiyang2@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- arch/x86/crypto/sm4-aesni-avx-asm_64.S | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-)
diff --git a/arch/x86/crypto/sm4-aesni-avx-asm_64.S b/arch/x86/crypto/sm4-aesni-avx-asm_64.S index fa2c3f50aecb..18d2f5199194 100644 --- a/arch/x86/crypto/sm4-aesni-avx-asm_64.S +++ b/arch/x86/crypto/sm4-aesni-avx-asm_64.S @@ -367,10 +367,11 @@ SYM_FUNC_START(sm4_aesni_avx_crypt8) * %rdx: src (1..8 blocks) * %rcx: num blocks (1..8) */ - FRAME_BEGIN - cmpq $5, %rcx; jb sm4_aesni_avx_crypt4; + + FRAME_BEGIN + vmovdqu (0 * 16)(%rdx), RA0; vmovdqu (1 * 16)(%rdx), RA1; vmovdqu (2 * 16)(%rdx), RA2;
From: 沈子俊 shenzijun@kylinos.cn
mainline inclusion from mainline-v5.16 commit 4a7e1e5fc294687a8941fa3eeb4a7e8539ca5e2f category: bugfix bugzilla: NA CVE: NA
-----------------------------------------------------------------
When building with clang and GNU as, there is a warning about ignored changed section attributes:
/tmp/sm4-c916c8.s: Assembler messages: /tmp/sm4-c916c8.s:677: Warning: ignoring changed section attributes for .data..cacheline_aligned
"static const" places the data in .rodata but __cacheline_aligned has the section attribute to place it in .data..cacheline_aligned, in addition to the aligned attribute.
To keep the alignment but avoid attempting to change sections, use the ____cacheline_aligned attribute, which is just the aligned attribute.
Fixes: 2b31277af577 ("crypto: sm4 - create SM4 library based on sm4 generic code") Link: https://github.com/ClangBuiltLinux/linux/issues/1441 Signed-off-by: Nathan Chancellor nathan@kernel.org Reviewed-by: Tianjia Zhang tianjia.zhang@linux.alibaba.com Signed-off-by: Herbert Xu herbert@gondor.apana.org.au Signed-off-by: 沈子俊 shenzijun@kylinos.cn Reviewed-by: weiyang wang wangweiyang2@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- lib/crypto/sm4.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/lib/crypto/sm4.c b/lib/crypto/sm4.c index 633b59fed9db..284e62576d0c 100644 --- a/lib/crypto/sm4.c +++ b/lib/crypto/sm4.c @@ -15,7 +15,7 @@ static const u32 fk[4] = { 0xa3b1bac6, 0x56aa3350, 0x677d9197, 0xb27022dc };
-static const u32 __cacheline_aligned ck[32] = { +static const u32 ____cacheline_aligned ck[32] = { 0x00070e15, 0x1c232a31, 0x383f464d, 0x545b6269, 0x70777e85, 0x8c939aa1, 0xa8afb6bd, 0xc4cbd2d9, 0xe0e7eef5, 0xfc030a11, 0x181f262d, 0x343b4249, @@ -26,7 +26,7 @@ static const u32 __cacheline_aligned ck[32] = { 0x10171e25, 0x2c333a41, 0x484f565d, 0x646b7279 };
-static const u8 __cacheline_aligned sbox[256] = { +static const u8 ____cacheline_aligned sbox[256] = { 0xd6, 0x90, 0xe9, 0xfe, 0xcc, 0xe1, 0x3d, 0xb7, 0x16, 0xb6, 0x14, 0xc2, 0x28, 0xfb, 0x2c, 0x05, 0x2b, 0x67, 0x9a, 0x76, 0x2a, 0xbe, 0x04, 0xc3,
From: 沈子俊 shenzijun@kylinos.cn
mainline inclusion from mainline-v5.15 commit f8690a4b5a1b64f74ae5c4f7c4ea880d8a8e1a0d category: bugfix bugzilla: NA CVE: NA
--------------------------------------------------------------
This fixes the following warning:
vmlinux.o: warning: objtool: elf_update: invalid section entry size
The size of the rodata section is 164 bytes, directly using the entry_size of 164 bytes will cause errors in some versions of the gcc compiler, while using 16 bytes directly will cause errors in the clang compiler. This patch correct it by filling the size of rodata to a 16-byte boundary.
Fixes: a7ee22ee1445 ("crypto: x86/sm4 - add AES-NI/AVX/x86_64 implementation") Fixes: 5b2efa2bb865 ("crypto: x86/sm4 - add AES-NI/AVX2/x86_64 implementation") Reported-by: Peter Zijlstra peterz@infradead.org Reported-by: Abaci Robot abaci@linux.alibaba.com Signed-off-by: Tianjia Zhang tianjia.zhang@linux.alibaba.com Tested-by: Heyuan Shi heyuan@linux.alibaba.com Signed-off-by: Herbert Xu herbert@gondor.apana.org.au Signed-off-by: 沈子俊 shenzijun@kylinos.cn Reviewed-by: weiyang wang wangweiyang2@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- arch/x86/crypto/sm4-aesni-avx-asm_64.S | 6 +++++- arch/x86/crypto/sm4-aesni-avx2-asm_64.S | 6 +++++- 2 files changed, 10 insertions(+), 2 deletions(-)
diff --git a/arch/x86/crypto/sm4-aesni-avx-asm_64.S b/arch/x86/crypto/sm4-aesni-avx-asm_64.S index 18d2f5199194..1cc72b4804fa 100644 --- a/arch/x86/crypto/sm4-aesni-avx-asm_64.S +++ b/arch/x86/crypto/sm4-aesni-avx-asm_64.S @@ -78,7 +78,7 @@ vpxor tmp0, x, x;
-.section .rodata.cst164, "aM", @progbits, 164 +.section .rodata.cst16, "aM", @progbits, 16 .align 16
/* @@ -133,6 +133,10 @@ .L0f0f0f0f: .long 0x0f0f0f0f
+/* 12 bytes, only for padding */ +.Lpadding_deadbeef: + .long 0xdeadbeef, 0xdeadbeef, 0xdeadbeef +
.text .align 16 diff --git a/arch/x86/crypto/sm4-aesni-avx2-asm_64.S b/arch/x86/crypto/sm4-aesni-avx2-asm_64.S index d2ffd7f76ee2..9c5d3f3ad45a 100644 --- a/arch/x86/crypto/sm4-aesni-avx2-asm_64.S +++ b/arch/x86/crypto/sm4-aesni-avx2-asm_64.S @@ -93,7 +93,7 @@ vpxor tmp0, x, x;
-.section .rodata.cst164, "aM", @progbits, 164 +.section .rodata.cst16, "aM", @progbits, 16 .align 16
/* @@ -148,6 +148,10 @@ .L0f0f0f0f: .long 0x0f0f0f0f
+/* 12 bytes, only for padding */ +.Lpadding_deadbeef: + .long 0xdeadbeef, 0xdeadbeef, 0xdeadbeef + .text .align 16
From: Yunjian Wang wangyunjian@huawei.com
mainline inclusion from mainline-5.12-rc1 commit dc9c9e72ff3ba01ae63e6263ac26234ba1869cd7 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4MFVB CVE: NA
-------------------------------------------------
Currently the driver doesn't drop a packet which can't be sent by tun (e.g bad packet). In this case, the driver will always process the same packet lead to the tx queue stuck.
To fix this issue: 1. in the case of persistent failure (e.g bad packet), the driver can skip this descriptor by ignoring the error. 2. in the case of transient failure (e.g -ENOBUFS, -EAGAIN and -ENOMEM), the driver schedules the worker to try again.
Signed-off-by: Yunjian Wang wangyunjian@huawei.com Acked-by: Jason Wang jasowang@redhat.com Acked-by: Willem de Bruijn willemb@google.com Acked-by: Michael S. Tsirkin mst@redhat.com Link: https://lore.kernel.org/r/1610685980-38608-1-git-send-email-wangyunjian@huaw... Signed-off-by: Jakub Kicinski kuba@kernel.org Signed-off-by: Lu Wei luwei32@huawei.com Reviewed-by: Wei Yongjun weiyongjun1@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- drivers/vhost/net.c | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-)
diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c index da02c3e96e7b..666c2d5b6673 100644 --- a/drivers/vhost/net.c +++ b/drivers/vhost/net.c @@ -836,14 +836,15 @@ static void handle_tx_copy(struct vhost_net *net, struct socket *sock) msg.msg_flags &= ~MSG_MORE; }
- /* TODO: Check specific error and bomb out unless ENOBUFS? */ err = sock->ops->sendmsg(sock, &msg, len); if (unlikely(err < 0)) { - vhost_discard_vq_desc(vq, 1); - vhost_net_enable_vq(net, vq); - break; - } - if (err != len) + if (err == -EAGAIN || err == -ENOMEM || err == -ENOBUFS) { + vhost_discard_vq_desc(vq, 1); + vhost_net_enable_vq(net, vq); + break; + } + pr_debug("Fail to send packet: err %d", err); + } else if (unlikely(err != len)) pr_debug("Truncated TX packet: len %d != %zd\n", err, len); done: @@ -931,7 +932,6 @@ static void handle_tx_zerocopy(struct vhost_net *net, struct socket *sock) msg.msg_flags &= ~MSG_MORE; }
- /* TODO: Check specific error and bomb out unless ENOBUFS? */ err = sock->ops->sendmsg(sock, &msg, len); if (unlikely(err < 0)) { if (zcopy_used) { @@ -940,11 +940,13 @@ static void handle_tx_zerocopy(struct vhost_net *net, struct socket *sock) nvq->upend_idx = ((unsigned)nvq->upend_idx - 1) % UIO_MAXIOV; } - vhost_discard_vq_desc(vq, 1); - vhost_net_enable_vq(net, vq); - break; - } - if (err != len) + if (err == -EAGAIN || err == -ENOMEM || err == -ENOBUFS) { + vhost_discard_vq_desc(vq, 1); + vhost_net_enable_vq(net, vq); + break; + } + pr_debug("Fail to send packet: err %d", err); + } else if (unlikely(err != len)) pr_debug("Truncated TX packet: " " len %d != %zd\n", err, len); if (!zcopy_used)
From: Hao Chen chenhao288@hisilicon.com
mainline inclusion from mainline-master commit 448f413a8bdc category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4M1HB CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
----------------------------------------------------------------------
Add support for ethtool to set/get tx copybreak buf size.
Signed-off-by: Hao Chen chenhao288@hisilicon.com Signed-off-by: Guangbin Huang huangguangbin2@huawei.com Signed-off-by: David S. Miller davem@davemloft.net Reviewed-by: Yongxin Li liyongxin1@huawei.com Signed-off-by: Junxin Chen chenjunxin1@huawei.com Acked-by: Xie XiuQi xiexiuqi@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- include/uapi/linux/ethtool.h | 1 + net/ethtool/common.c | 1 + net/ethtool/ioctl.c | 1 + 3 files changed, 3 insertions(+)
diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h index 3d318be2437c..53757e696fdc 100644 --- a/include/uapi/linux/ethtool.h +++ b/include/uapi/linux/ethtool.h @@ -221,6 +221,7 @@ enum tunable_id { ETHTOOL_RX_COPYBREAK, ETHTOOL_TX_COPYBREAK, ETHTOOL_PFC_PREVENTION_TOUT, /* timeout in msecs */ + ETHTOOL_TX_COPYBREAK_BUF_SIZE, /* * Add your fresh new tunable attribute above and remember to update * tunable_strings[] in net/ethtool/common.c diff --git a/net/ethtool/common.c b/net/ethtool/common.c index 24036e3055a1..408c3a667571 100644 --- a/net/ethtool/common.c +++ b/net/ethtool/common.c @@ -83,6 +83,7 @@ tunable_strings[__ETHTOOL_TUNABLE_COUNT][ETH_GSTRING_LEN] = { [ETHTOOL_RX_COPYBREAK] = "rx-copybreak", [ETHTOOL_TX_COPYBREAK] = "tx-copybreak", [ETHTOOL_PFC_PREVENTION_TOUT] = "pfc-prevention-tout", + [ETHTOOL_TX_COPYBREAK_BUF_SIZE] = "tx-copybreak-buf-size", };
const char diff --git a/net/ethtool/ioctl.c b/net/ethtool/ioctl.c index 5081cf226769..4601889f7b60 100644 --- a/net/ethtool/ioctl.c +++ b/net/ethtool/ioctl.c @@ -2364,6 +2364,7 @@ static int ethtool_tunable_valid(const struct ethtool_tunable *tuna) switch (tuna->id) { case ETHTOOL_RX_COPYBREAK: case ETHTOOL_TX_COPYBREAK: + case ETHTOOL_TX_COPYBREAK_BUF_SIZE: if (tuna->len != sizeof(u32) || tuna->type_id != ETHTOOL_TUNABLE_U32) return -EINVAL;
From: Hao Chen chenhao288@hisilicon.com
mainline inclusion from mainline-master commit e445f08af2b1 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4M1HB CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
----------------------------------------------------------------------
Tx copybreak buf size is used for tx copybreak feature, the feature is used for small size packet or frag. It adds a queue based tx shared bounce buffer to memcpy the small packet when the len of xmitted skb is below tx_copybreak(value to distinguish small size and normal size), and reduce the overhead of dma map and unmap when IOMMU is on.
Support setting it via ethtool --set-tunable parameter and getting it via ethtool --get-tunable parameter.
Signed-off-by: Hao Chen chenhao288@hisilicon.com Signed-off-by: Guangbin Huang huangguangbin2@huawei.com Signed-off-by: David S. Miller davem@davemloft.net Reviewed-by: Yongxin Li liyongxin1@huawei.com Signed-off-by: Junxin Chen chenjunxin1@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- .../net/ethernet/hisilicon/hns3/hns3_enet.c | 4 +- .../net/ethernet/hisilicon/hns3/hns3_enet.h | 2 + .../ethernet/hisilicon/hns3/hns3_ethtool.c | 56 +++++++++++++++++++ 3 files changed, 60 insertions(+), 2 deletions(-)
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c index 68082f8d6ae8..cb0d5381482e 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c @@ -5531,8 +5531,8 @@ static int hns3_reset_notify_uninit_enet(struct hnae3_handle *handle) return 0; }
-static int hns3_reset_notify(struct hnae3_handle *handle, - enum hnae3_reset_notify_type type) +int hns3_reset_notify(struct hnae3_handle *handle, + enum hnae3_reset_notify_type type) { int ret = 0;
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.h b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.h index 1715c98d906d..361a6390e159 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.h +++ b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.h @@ -705,6 +705,8 @@ void hns3_set_vector_coalesce_tx_ql(struct hns3_enet_tqp_vector *tqp_vector, u32 ql_value);
void hns3_request_update_promisc_mode(struct hnae3_handle *handle); +int hns3_reset_notify(struct hnae3_handle *handle, + enum hnae3_reset_notify_type type);
#ifdef CONFIG_HNS3_DCB void hns3_dcbnl_setup(struct hnae3_handle *handle); diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c b/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c index c8442b86df94..9a816dbec613 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c @@ -1695,6 +1695,7 @@ static int hns3_get_tunable(struct net_device *netdev, void *data) { struct hns3_nic_priv *priv = netdev_priv(netdev); + struct hnae3_handle *h = priv->ae_handle; int ret = 0;
switch (tuna->id) { @@ -1705,6 +1706,9 @@ static int hns3_get_tunable(struct net_device *netdev, case ETHTOOL_RX_COPYBREAK: *(u32 *)data = priv->rx_copybreak; break; + case ETHTOOL_TX_COPYBREAK_BUF_SIZE: + *(u32 *)data = h->kinfo.tx_spare_buf_size; + break; default: ret = -EOPNOTSUPP; break; @@ -1713,11 +1717,43 @@ static int hns3_get_tunable(struct net_device *netdev, return ret; }
+static int hns3_set_tx_spare_buf_size(struct net_device *netdev, + u32 data) +{ + struct hns3_nic_priv *priv = netdev_priv(netdev); + struct hnae3_handle *h = priv->ae_handle; + int ret; + + if (hns3_nic_resetting(netdev)) + return -EBUSY; + + h->kinfo.tx_spare_buf_size = data; + + ret = hns3_reset_notify(h, HNAE3_DOWN_CLIENT); + if (ret) + return ret; + + ret = hns3_reset_notify(h, HNAE3_UNINIT_CLIENT); + if (ret) + return ret; + + ret = hns3_reset_notify(h, HNAE3_INIT_CLIENT); + if (ret) + return ret; + + ret = hns3_reset_notify(h, HNAE3_UP_CLIENT); + if (ret) + hns3_reset_notify(h, HNAE3_UNINIT_CLIENT); + + return ret; +} + static int hns3_set_tunable(struct net_device *netdev, const struct ethtool_tunable *tuna, const void *data) { struct hns3_nic_priv *priv = netdev_priv(netdev); + u32 old_tx_spare_buf_size, new_tx_spare_buf_size; struct hnae3_handle *h = priv->ae_handle; int i, ret = 0;
@@ -1735,6 +1771,26 @@ static int hns3_set_tunable(struct net_device *netdev, for (i = h->kinfo.num_tqps; i < h->kinfo.num_tqps * 2; i++) priv->ring[i].rx_copybreak = priv->rx_copybreak;
+ break; + case ETHTOOL_TX_COPYBREAK_BUF_SIZE: + old_tx_spare_buf_size = h->kinfo.tx_spare_buf_size; + new_tx_spare_buf_size = *(u32 *)data; + ret = hns3_set_tx_spare_buf_size(netdev, new_tx_spare_buf_size); + if (ret) { + int ret1; + + netdev_warn(netdev, + "change tx spare buf size fail, revert to old value\n"); + ret1 = hns3_set_tx_spare_buf_size(netdev, + old_tx_spare_buf_size); + if (ret1) { + netdev_err(netdev, + "revert to old tx spare buf size fail\n"); + return ret1; + } + + return ret; + } break; default: ret = -EOPNOTSUPP;
From: Hao Chen chenhao288@hisilicon.com
mainline inclusion from mainline-master commit 0b70c256eba8 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4M1HB CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
----------------------------------------------------------------------
Add support to set rx buf len via ethtool -G parameter and get rx buf len via ethtool -g parameter.
Signed-off-by: Hao Chen chenhao288@hisilicon.com Signed-off-by: Guangbin Huang huangguangbin2@huawei.com Signed-off-by: David S. Miller davem@davemloft.net Reviewed-by: Yongxin Li liyongxin1@huawei.com Signed-off-by: Junxin Chen chenjunxin1@huawei.com (fix conflicts: remove extend link modes) Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- Documentation/networking/ethtool-netlink.rst | 10 +++++---- include/linux/ethtool.h | 18 +++++++++++++++ include/uapi/linux/ethtool_netlink.h | 1 + net/ethtool/netlink.h | 2 +- net/ethtool/rings.c | 23 ++++++++++++++++++-- 5 files changed, 47 insertions(+), 7 deletions(-)
diff --git a/Documentation/networking/ethtool-netlink.rst b/Documentation/networking/ethtool-netlink.rst index dd8c30c51ce0..8ada76198b54 100644 --- a/Documentation/networking/ethtool-netlink.rst +++ b/Documentation/networking/ethtool-netlink.rst @@ -820,7 +820,7 @@ Request contents:
Kernel response contents:
- ==================================== ====== ========================== + ==================================== ====== =========================== ``ETHTOOL_A_RINGS_HEADER`` nested reply header ``ETHTOOL_A_RINGS_RX_MAX`` u32 max size of RX ring ``ETHTOOL_A_RINGS_RX_MINI_MAX`` u32 max size of RX mini ring @@ -830,7 +830,8 @@ Kernel response contents: ``ETHTOOL_A_RINGS_RX_MINI`` u32 size of RX mini ring ``ETHTOOL_A_RINGS_RX_JUMBO`` u32 size of RX jumbo ring ``ETHTOOL_A_RINGS_TX`` u32 size of TX ring - ==================================== ====== ========================== + ``ETHTOOL_A_RINGS_RX_BUF_LEN`` u32 size of buffers on the ring + ==================================== ====== ===========================
RINGS_SET @@ -840,13 +841,14 @@ Sets ring sizes like ``ETHTOOL_SRINGPARAM`` ioctl request.
Request contents:
- ==================================== ====== ========================== + ==================================== ====== =========================== ``ETHTOOL_A_RINGS_HEADER`` nested reply header ``ETHTOOL_A_RINGS_RX`` u32 size of RX ring ``ETHTOOL_A_RINGS_RX_MINI`` u32 size of RX mini ring ``ETHTOOL_A_RINGS_RX_JUMBO`` u32 size of RX jumbo ring ``ETHTOOL_A_RINGS_TX`` u32 size of TX ring - ==================================== ====== ========================== + ``ETHTOOL_A_RINGS_RX_BUF_LEN`` u32 size of buffers on the ring + ==================================== ====== ===========================
Kernel checks that requested ring sizes do not exceed limits reported by driver. Driver may impose additional constraints and may not suspport all diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h index f1fbb9595839..0bce31a7a508 100644 --- a/include/linux/ethtool.h +++ b/include/linux/ethtool.h @@ -67,6 +67,22 @@ enum { ETH_RSS_HASH_FUNCS_COUNT };
+/** + * struct kernel_ethtool_ringparam - RX/TX ring configuration + * @rx_buf_len: Current length of buffers on the rx ring. + */ +struct kernel_ethtool_ringparam { + u32 rx_buf_len; +}; + +/** + * enum ethtool_supported_ring_param - indicator caps for setting ring params + * @ETHTOOL_RING_USE_RX_BUF_LEN: capture for setting rx_buf_len + */ +enum ethtool_supported_ring_param { + ETHTOOL_RING_USE_RX_BUF_LEN = BIT(0), +}; + #define __ETH_RSS_HASH_BIT(bit) ((u32)1 << (bit)) #define __ETH_RSS_HASH(name) __ETH_RSS_HASH_BIT(ETH_RSS_HASH_##name##_BIT)
@@ -272,6 +288,7 @@ struct ethtool_pause_stats { /** * struct ethtool_ops - optional netdev operations * @supported_coalesce_params: supported types of interrupt coalescing. + * @supported_ring_params: supported ring params. * @get_drvinfo: Report driver/device information. Should only set the * @driver, @version, @fw_version and @bus_info fields. If not * implemented, the @driver and @bus_info fields will be filled in @@ -427,6 +444,7 @@ struct ethtool_pause_stats { */ struct ethtool_ops { u32 supported_coalesce_params; + u32 supported_ring_params; void (*get_drvinfo)(struct net_device *, struct ethtool_drvinfo *); int (*get_regs_len)(struct net_device *); void (*get_regs)(struct net_device *, struct ethtool_regs *, void *); diff --git a/include/uapi/linux/ethtool_netlink.h b/include/uapi/linux/ethtool_netlink.h index 28165c92806e..f2d2ac2a3295 100644 --- a/include/uapi/linux/ethtool_netlink.h +++ b/include/uapi/linux/ethtool_netlink.h @@ -314,6 +314,7 @@ enum { ETHTOOL_A_RINGS_RX_MINI, /* u32 */ ETHTOOL_A_RINGS_RX_JUMBO, /* u32 */ ETHTOOL_A_RINGS_TX, /* u32 */ + ETHTOOL_A_RINGS_RX_BUF_LEN, /* u32 */
/* add new constants above here */ __ETHTOOL_A_RINGS_CNT, diff --git a/net/ethtool/netlink.h b/net/ethtool/netlink.h index 5e35ec53c036..6c16bd88916b 100644 --- a/net/ethtool/netlink.h +++ b/net/ethtool/netlink.h @@ -362,7 +362,7 @@ extern const struct nla_policy ethnl_features_set_policy[ETHTOOL_A_FEATURES_WANT extern const struct nla_policy ethnl_privflags_get_policy[ETHTOOL_A_PRIVFLAGS_HEADER + 1]; extern const struct nla_policy ethnl_privflags_set_policy[ETHTOOL_A_PRIVFLAGS_FLAGS + 1]; extern const struct nla_policy ethnl_rings_get_policy[ETHTOOL_A_RINGS_HEADER + 1]; -extern const struct nla_policy ethnl_rings_set_policy[ETHTOOL_A_RINGS_TX + 1]; +extern const struct nla_policy ethnl_rings_set_policy[ETHTOOL_A_RINGS_RX_BUF_LEN + 1]; extern const struct nla_policy ethnl_channels_get_policy[ETHTOOL_A_CHANNELS_HEADER + 1]; extern const struct nla_policy ethnl_channels_set_policy[ETHTOOL_A_CHANNELS_COMBINED_COUNT + 1]; extern const struct nla_policy ethnl_coalesce_get_policy[ETHTOOL_A_COALESCE_HEADER + 1]; diff --git a/net/ethtool/rings.c b/net/ethtool/rings.c index 4e097812a967..bd8e9a7530eb 100644 --- a/net/ethtool/rings.c +++ b/net/ethtool/rings.c @@ -10,6 +10,7 @@ struct rings_req_info { struct rings_reply_data { struct ethnl_reply_data base; struct ethtool_ringparam ringparam; + struct kernel_ethtool_ringparam kernel_ringparam; };
#define RINGS_REPDATA(__reply_base) \ @@ -49,7 +50,8 @@ static int rings_reply_size(const struct ethnl_req_info *req_base, nla_total_size(sizeof(u32)) + /* _RINGS_RX */ nla_total_size(sizeof(u32)) + /* _RINGS_RX_MINI */ nla_total_size(sizeof(u32)) + /* _RINGS_RX_JUMBO */ - nla_total_size(sizeof(u32)); /* _RINGS_TX */ + nla_total_size(sizeof(u32)) + /* _RINGS_TX */ + nla_total_size(sizeof(u32)); /* _RINGS_RX_BUF_LEN */ }
static int rings_fill_reply(struct sk_buff *skb, @@ -57,6 +59,7 @@ static int rings_fill_reply(struct sk_buff *skb, const struct ethnl_reply_data *reply_base) { const struct rings_reply_data *data = RINGS_REPDATA(reply_base); + const struct kernel_ethtool_ringparam *kernel_ringparam = &data->kernel_ringparam; const struct ethtool_ringparam *ringparam = &data->ringparam;
if ((ringparam->rx_max_pending && @@ -78,7 +81,10 @@ static int rings_fill_reply(struct sk_buff *skb, (nla_put_u32(skb, ETHTOOL_A_RINGS_TX_MAX, ringparam->tx_max_pending) || nla_put_u32(skb, ETHTOOL_A_RINGS_TX, - ringparam->tx_pending)))) + ringparam->tx_pending))) || + (kernel_ringparam->rx_buf_len && + (nla_put_u32(skb, ETHTOOL_A_RINGS_RX_BUF_LEN, + kernel_ringparam->rx_buf_len)))) return -EMSGSIZE;
return 0; @@ -105,10 +111,12 @@ const struct nla_policy ethnl_rings_set_policy[] = { [ETHTOOL_A_RINGS_RX_MINI] = { .type = NLA_U32 }, [ETHTOOL_A_RINGS_RX_JUMBO] = { .type = NLA_U32 }, [ETHTOOL_A_RINGS_TX] = { .type = NLA_U32 }, + [ETHTOOL_A_RINGS_RX_BUF_LEN] = NLA_POLICY_MIN(NLA_U32, 1), };
int ethnl_set_rings(struct sk_buff *skb, struct genl_info *info) { + struct kernel_ethtool_ringparam kernel_ringparam = {}; struct ethtool_ringparam ringparam = {}; struct ethnl_req_info req_info = {}; struct nlattr **tb = info->attrs; @@ -142,6 +150,8 @@ int ethnl_set_rings(struct sk_buff *skb, struct genl_info *info) ethnl_update_u32(&ringparam.rx_jumbo_pending, tb[ETHTOOL_A_RINGS_RX_JUMBO], &mod); ethnl_update_u32(&ringparam.tx_pending, tb[ETHTOOL_A_RINGS_TX], &mod); + ethnl_update_u32(&kernel_ringparam.rx_buf_len, + tb[ETHTOOL_A_RINGS_RX_BUF_LEN], &mod); ret = 0; if (!mod) goto out_ops; @@ -164,6 +174,15 @@ int ethnl_set_rings(struct sk_buff *skb, struct genl_info *info) goto out_ops; }
+ if (kernel_ringparam.rx_buf_len != 0 && + !(ops->supported_ring_params & ETHTOOL_RING_USE_RX_BUF_LEN)) { + ret = -EOPNOTSUPP; + NL_SET_ERR_MSG_ATTR(info->extack, + tb[ETHTOOL_A_RINGS_RX_BUF_LEN], + "setting rx buf len not supported"); + goto out_ops; + } + ret = dev->ethtool_ops->set_ringparam(dev, &ringparam); if (ret < 0) goto out_ops;
From: Hao Chen chenhao288@hisilicon.com
mainline inclusion from mainline-master commit 7462494408cd category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4M1HB CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
----------------------------------------------------------------------
Add two new parameters kernel_ringparam and extack for .get_ringparam and .set_ringparam to extend more ring params through netlink.
Signed-off-by: Hao Chen chenhao288@hisilicon.com Signed-off-by: Guangbin Huang huangguangbin2@huawei.com Signed-off-by: David S. Miller davem@davemloft.net Reviewed-by: Yongxin Li liyongxin1@huawei.com Signed-off-by: Junxin Chen chenjunxin1@huawei.com (fix conflicts: remove unsupported ethernets and ops) (fix conflicts: adapt spnic and txgbe for interface change) Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- arch/um/drivers/vector_kern.c | 4 ++- drivers/net/ethernet/3com/typhoon.c | 4 ++- drivers/net/ethernet/amazon/ena/ena_ethtool.c | 8 ++++-- drivers/net/ethernet/amd/pcnet32.c | 8 ++++-- drivers/net/ethernet/amd/xgbe/xgbe-ethtool.c | 11 +++++--- .../ethernet/aquantia/atlantic/aq_ethtool.c | 8 ++++-- drivers/net/ethernet/atheros/atlx/atl1.c | 8 ++++-- drivers/net/ethernet/broadcom/b44.c | 8 ++++-- drivers/net/ethernet/broadcom/bcm63xx_enet.c | 25 +++++++++++++------ drivers/net/ethernet/broadcom/bnx2.c | 8 ++++-- .../ethernet/broadcom/bnx2x/bnx2x_ethtool.c | 8 ++++-- .../net/ethernet/broadcom/bnxt/bnxt_ethtool.c | 8 ++++-- drivers/net/ethernet/broadcom/tg3.c | 10 ++++++-- .../net/ethernet/brocade/bna/bnad_ethtool.c | 8 ++++-- drivers/net/ethernet/cadence/macb_main.c | 8 ++++-- .../ethernet/cavium/liquidio/lio_ethtool.c | 11 +++++--- .../ethernet/cavium/thunder/nicvf_ethtool.c | 8 ++++-- drivers/net/ethernet/chelsio/cxgb/cxgb2.c | 8 ++++-- .../net/ethernet/chelsio/cxgb3/cxgb3_main.c | 8 ++++-- .../ethernet/chelsio/cxgb4/cxgb4_ethtool.c | 8 ++++-- .../ethernet/chelsio/cxgb4vf/cxgb4vf_main.c | 8 ++++-- .../net/ethernet/cisco/enic/enic_ethtool.c | 8 ++++-- drivers/net/ethernet/cortina/gemini.c | 8 ++++-- .../net/ethernet/emulex/benet/be_ethtool.c | 4 ++- drivers/net/ethernet/ethoc.c | 8 ++++-- drivers/net/ethernet/faraday/ftgmac100.c | 14 ++++++++--- .../ethernet/freescale/enetc/enetc_ethtool.c | 4 ++- .../net/ethernet/freescale/gianfar_ethtool.c | 8 ++++-- .../net/ethernet/freescale/ucc_geth_ethtool.c | 8 ++++-- drivers/net/ethernet/google/gve/gve_ethtool.c | 4 ++- .../net/ethernet/hisilicon/hns/hns_ethtool.c | 6 ++++- .../ethernet/hisilicon/hns3/hns3_ethtool.c | 8 ++++-- .../net/ethernet/huawei/hinic/hinic_ethtool.c | 8 ++++-- drivers/net/ethernet/ibm/emac/core.c | 7 ++++-- drivers/net/ethernet/ibm/ibmvnic.c | 8 ++++-- drivers/net/ethernet/intel/e100.c | 8 ++++-- .../net/ethernet/intel/e1000/e1000_ethtool.c | 8 ++++-- drivers/net/ethernet/intel/e1000e/ethtool.c | 8 ++++-- .../net/ethernet/intel/fm10k/fm10k_ethtool.c | 8 ++++-- .../net/ethernet/intel/i40e/i40e_ethtool.c | 8 ++++-- .../net/ethernet/intel/iavf/iavf_ethtool.c | 12 +++++++-- drivers/net/ethernet/intel/ice/ice_ethtool.c | 8 ++++-- drivers/net/ethernet/intel/igb/igb_ethtool.c | 8 ++++-- drivers/net/ethernet/intel/igbvf/ethtool.c | 8 ++++-- drivers/net/ethernet/intel/igc/igc_ethtool.c | 14 ++++++++--- .../net/ethernet/intel/ixgb/ixgb_ethtool.c | 8 ++++-- .../net/ethernet/intel/ixgbe/ixgbe_ethtool.c | 8 ++++-- drivers/net/ethernet/intel/ixgbevf/ethtool.c | 8 ++++-- drivers/net/ethernet/marvell/mv643xx_eth.c | 8 ++++-- drivers/net/ethernet/marvell/mvneta.c | 14 ++++++++--- .../net/ethernet/marvell/mvpp2/mvpp2_main.c | 14 ++++++++--- .../marvell/octeontx2/nic/otx2_ethtool.c | 8 ++++-- drivers/net/ethernet/marvell/skge.c | 8 ++++-- drivers/net/ethernet/marvell/sky2.c | 8 ++++-- .../net/ethernet/mellanox/mlx4/en_ethtool.c | 8 ++++-- .../ethernet/mellanox/mlx5/core/en_ethtool.c | 8 ++++-- .../net/ethernet/mellanox/mlx5/core/en_rep.c | 14 ++++++++--- .../mellanox/mlx5/core/ipoib/ethtool.c | 8 ++++-- drivers/net/ethernet/micrel/ksz884x.c | 6 ++++- .../net/ethernet/myricom/myri10ge/myri10ge.c | 4 ++- drivers/net/ethernet/neterion/s2io.c | 7 ++++-- .../ethernet/netronome/nfp/nfp_net_ethtool.c | 8 ++++-- .../ethernet/netswift/txgbe/txgbe_ethtool.c | 8 ++++-- drivers/net/ethernet/nvidia/forcedeth.c | 10 ++++++-- .../oki-semi/pch_gbe/pch_gbe_ethtool.c | 12 +++++++-- .../net/ethernet/pasemi/pasemi_mac_ethtool.c | 4 ++- .../ethernet/pensando/ionic/ionic_ethtool.c | 8 ++++-- .../qlogic/netxen/netxen_nic_ethtool.c | 8 ++++-- .../net/ethernet/qlogic/qede/qede_ethtool.c | 8 ++++-- .../ethernet/qlogic/qlcnic/qlcnic_ethtool.c | 8 ++++-- .../net/ethernet/qualcomm/emac/emac-ethtool.c | 8 ++++-- drivers/net/ethernet/qualcomm/qca_debug.c | 8 ++++-- .../ethernet/ramaxel/spnic/spnic_ethtool.c | 8 ++++-- drivers/net/ethernet/realtek/8139cp.c | 4 ++- drivers/net/ethernet/renesas/ravb_main.c | 8 ++++-- drivers/net/ethernet/renesas/sh_eth.c | 8 ++++-- drivers/net/ethernet/sfc/ef100_ethtool.c | 7 ++++-- drivers/net/ethernet/sfc/ethtool.c | 14 ++++++++--- drivers/net/ethernet/sfc/falcon/ethtool.c | 14 ++++++++--- .../ethernet/stmicro/stmmac/stmmac_ethtool.c | 8 ++++-- drivers/net/ethernet/tehuti/tehuti.c | 12 +++++++-- drivers/net/ethernet/ti/am65-cpsw-ethtool.c | 7 ++++-- drivers/net/ethernet/ti/cpmac.c | 8 ++++-- drivers/net/ethernet/ti/cpsw_ethtool.c | 8 ++++-- drivers/net/ethernet/ti/cpsw_priv.h | 8 ++++-- .../net/ethernet/toshiba/spider_net_ethtool.c | 4 ++- drivers/net/ethernet/xilinx/ll_temac_main.c | 14 ++++++++--- .../net/ethernet/xilinx/xilinx_axienet_main.c | 14 ++++++++--- drivers/net/hyperv/netvsc_drv.c | 8 ++++-- drivers/net/usb/r8152.c | 8 ++++-- drivers/net/virtio_net.c | 4 ++- drivers/net/vmxnet3/vmxnet3_ethtool.c | 10 +++++--- drivers/s390/net/qeth_ethtool.c | 4 ++- include/linux/ethtool.h | 8 ++++-- net/ethtool/ioctl.c | 10 +++++--- net/ethtool/rings.c | 9 ++++--- net/mac80211/ethtool.c | 8 ++++-- 97 files changed, 612 insertions(+), 210 deletions(-)
diff --git a/arch/um/drivers/vector_kern.c b/arch/um/drivers/vector_kern.c index 555203e3e7b4..8a155067930b 100644 --- a/arch/um/drivers/vector_kern.c +++ b/arch/um/drivers/vector_kern.c @@ -1442,7 +1442,9 @@ static int vector_net_load_bpf_flash(struct net_device *dev, }
static void vector_get_ringparam(struct net_device *netdev, - struct ethtool_ringparam *ring) + struct ethtool_ringparam *ring, + struct kernel_ethtool_ringparam *kernel_ring, + struct netlink_ext_ack *extack) { struct vector_private *vp = netdev_priv(netdev);
diff --git a/drivers/net/ethernet/3com/typhoon.c b/drivers/net/ethernet/3com/typhoon.c index 05e15b6e5e2c..481f1df3106c 100644 --- a/drivers/net/ethernet/3com/typhoon.c +++ b/drivers/net/ethernet/3com/typhoon.c @@ -1138,7 +1138,9 @@ typhoon_set_wol(struct net_device *dev, struct ethtool_wolinfo *wol) }
static void -typhoon_get_ringparam(struct net_device *dev, struct ethtool_ringparam *ering) +typhoon_get_ringparam(struct net_device *dev, struct ethtool_ringparam *ering, + struct kernel_ethtool_ringparam *kernel_ering, + struct netlink_ext_ack *extack) { ering->rx_max_pending = RXENT_ENTRIES; ering->tx_max_pending = TXLO_ENTRIES - 1; diff --git a/drivers/net/ethernet/amazon/ena/ena_ethtool.c b/drivers/net/ethernet/amazon/ena/ena_ethtool.c index 7bae8261bb90..06e576140e66 100644 --- a/drivers/net/ethernet/amazon/ena/ena_ethtool.c +++ b/drivers/net/ethernet/amazon/ena/ena_ethtool.c @@ -462,7 +462,9 @@ static void ena_get_drvinfo(struct net_device *dev, }
static void ena_get_ringparam(struct net_device *netdev, - struct ethtool_ringparam *ring) + struct ethtool_ringparam *ring, + struct kernel_ethtool_ringparam *kernel_ring, + struct netlink_ext_ack *extack) { struct ena_adapter *adapter = netdev_priv(netdev);
@@ -473,7 +475,9 @@ static void ena_get_ringparam(struct net_device *netdev, }
static int ena_set_ringparam(struct net_device *netdev, - struct ethtool_ringparam *ring) + struct ethtool_ringparam *ring, + struct kernel_ethtool_ringparam *kernel_ring, + struct netlink_ext_ack *extack) { struct ena_adapter *adapter = netdev_priv(netdev); u32 new_tx_size, new_rx_size; diff --git a/drivers/net/ethernet/amd/pcnet32.c b/drivers/net/ethernet/amd/pcnet32.c index f78daba60b35..e049225ba87a 100644 --- a/drivers/net/ethernet/amd/pcnet32.c +++ b/drivers/net/ethernet/amd/pcnet32.c @@ -860,7 +860,9 @@ static int pcnet32_nway_reset(struct net_device *dev) }
static void pcnet32_get_ringparam(struct net_device *dev, - struct ethtool_ringparam *ering) + struct ethtool_ringparam *ering, + struct kernel_ethtool_ringparam *kernel_ering, + struct netlink_ext_ack *extack) { struct pcnet32_private *lp = netdev_priv(dev);
@@ -871,7 +873,9 @@ static void pcnet32_get_ringparam(struct net_device *dev, }
static int pcnet32_set_ringparam(struct net_device *dev, - struct ethtool_ringparam *ering) + struct ethtool_ringparam *ering, + struct kernel_ethtool_ringparam *kernel_ering, + struct netlink_ext_ack *extack) { struct pcnet32_private *lp = netdev_priv(dev); unsigned long flags; diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-ethtool.c b/drivers/net/ethernet/amd/xgbe/xgbe-ethtool.c index bafc51c34e0b..ac70b99ae568 100644 --- a/drivers/net/ethernet/amd/xgbe/xgbe-ethtool.c +++ b/drivers/net/ethernet/amd/xgbe/xgbe-ethtool.c @@ -621,8 +621,11 @@ static int xgbe_get_module_eeprom(struct net_device *netdev, return pdata->phy_if.module_eeprom(pdata, eeprom, data); }
-static void xgbe_get_ringparam(struct net_device *netdev, - struct ethtool_ringparam *ringparam) +static void +xgbe_get_ringparam(struct net_device *netdev, + struct ethtool_ringparam *ringparam, + struct kernel_ethtool_ringparam *kernel_ringparam, + struct netlink_ext_ack *extack) { struct xgbe_prv_data *pdata = netdev_priv(netdev);
@@ -633,7 +636,9 @@ static void xgbe_get_ringparam(struct net_device *netdev, }
static int xgbe_set_ringparam(struct net_device *netdev, - struct ethtool_ringparam *ringparam) + struct ethtool_ringparam *ringparam, + struct kernel_ethtool_ringparam *kernel_ringparam, + struct netlink_ext_ack *extack) { struct xgbe_prv_data *pdata = netdev_priv(netdev); unsigned int rx, tx; diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_ethtool.c b/drivers/net/ethernet/aquantia/atlantic/aq_ethtool.c index a9ef0544e30f..a418238f6309 100644 --- a/drivers/net/ethernet/aquantia/atlantic/aq_ethtool.c +++ b/drivers/net/ethernet/aquantia/atlantic/aq_ethtool.c @@ -812,7 +812,9 @@ static int aq_ethtool_set_pauseparam(struct net_device *ndev, }
static void aq_get_ringparam(struct net_device *ndev, - struct ethtool_ringparam *ring) + struct ethtool_ringparam *ring, + struct kernel_ethtool_ringparam *kernel_ring, + struct netlink_ext_ack *extack) { struct aq_nic_s *aq_nic = netdev_priv(ndev); struct aq_nic_cfg_s *cfg; @@ -827,7 +829,9 @@ static void aq_get_ringparam(struct net_device *ndev, }
static int aq_set_ringparam(struct net_device *ndev, - struct ethtool_ringparam *ring) + struct ethtool_ringparam *ring, + struct kernel_ethtool_ringparam *kernel_ring, + struct netlink_ext_ack *extack) { struct aq_nic_s *aq_nic = netdev_priv(ndev); const struct aq_hw_caps_s *hw_caps; diff --git a/drivers/net/ethernet/atheros/atlx/atl1.c b/drivers/net/ethernet/atheros/atlx/atl1.c index eaf96d002fa5..851b8032aaaa 100644 --- a/drivers/net/ethernet/atheros/atlx/atl1.c +++ b/drivers/net/ethernet/atheros/atlx/atl1.c @@ -3438,7 +3438,9 @@ static void atl1_get_regs(struct net_device *netdev, struct ethtool_regs *regs, }
static void atl1_get_ringparam(struct net_device *netdev, - struct ethtool_ringparam *ring) + struct ethtool_ringparam *ring, + struct kernel_ethtool_ringparam *kernel_ring, + struct netlink_ext_ack *extack) { struct atl1_adapter *adapter = netdev_priv(netdev); struct atl1_tpd_ring *txdr = &adapter->tpd_ring; @@ -3451,7 +3453,9 @@ static void atl1_get_ringparam(struct net_device *netdev, }
static int atl1_set_ringparam(struct net_device *netdev, - struct ethtool_ringparam *ring) + struct ethtool_ringparam *ring, + struct kernel_ethtool_ringparam *kernel_ring, + struct netlink_ext_ack *extack) { struct atl1_adapter *adapter = netdev_priv(netdev); struct atl1_tpd_ring *tpdr = &adapter->tpd_ring; diff --git a/drivers/net/ethernet/broadcom/b44.c b/drivers/net/ethernet/broadcom/b44.c index b455b60a5434..fce6f46288b4 100644 --- a/drivers/net/ethernet/broadcom/b44.c +++ b/drivers/net/ethernet/broadcom/b44.c @@ -1959,7 +1959,9 @@ static int b44_set_link_ksettings(struct net_device *dev, }
static void b44_get_ringparam(struct net_device *dev, - struct ethtool_ringparam *ering) + struct ethtool_ringparam *ering, + struct kernel_ethtool_ringparam *kernel_ering, + struct netlink_ext_ack *extack) { struct b44 *bp = netdev_priv(dev);
@@ -1970,7 +1972,9 @@ static void b44_get_ringparam(struct net_device *dev, }
static int b44_set_ringparam(struct net_device *dev, - struct ethtool_ringparam *ering) + struct ethtool_ringparam *ering, + struct kernel_ethtool_ringparam *kernel_ering, + struct netlink_ext_ack *extack) { struct b44 *bp = netdev_priv(dev);
diff --git a/drivers/net/ethernet/broadcom/bcm63xx_enet.c b/drivers/net/ethernet/broadcom/bcm63xx_enet.c index 916824cca3fd..a2afc4672ea5 100644 --- a/drivers/net/ethernet/broadcom/bcm63xx_enet.c +++ b/drivers/net/ethernet/broadcom/bcm63xx_enet.c @@ -1478,8 +1478,11 @@ static int bcm_enet_set_link_ksettings(struct net_device *dev, } }
-static void bcm_enet_get_ringparam(struct net_device *dev, - struct ethtool_ringparam *ering) +static void +bcm_enet_get_ringparam(struct net_device *dev, + struct ethtool_ringparam *ering, + struct kernel_ethtool_ringparam *kernel_ering, + struct netlink_ext_ack *extack) { struct bcm_enet_priv *priv;
@@ -1493,7 +1496,9 @@ static void bcm_enet_get_ringparam(struct net_device *dev, }
static int bcm_enet_set_ringparam(struct net_device *dev, - struct ethtool_ringparam *ering) + struct ethtool_ringparam *ering, + struct kernel_ethtool_ringparam *kernel_ering, + struct netlink_ext_ack *extack) { struct bcm_enet_priv *priv; int was_running; @@ -2576,8 +2581,11 @@ static void bcm_enetsw_get_ethtool_stats(struct net_device *netdev, } }
-static void bcm_enetsw_get_ringparam(struct net_device *dev, - struct ethtool_ringparam *ering) +static void +bcm_enetsw_get_ringparam(struct net_device *dev, + struct ethtool_ringparam *ering, + struct kernel_ethtool_ringparam *kernel_ering, + struct netlink_ext_ack *extack) { struct bcm_enet_priv *priv;
@@ -2592,8 +2600,11 @@ static void bcm_enetsw_get_ringparam(struct net_device *dev, ering->tx_pending = priv->tx_ring_size; }
-static int bcm_enetsw_set_ringparam(struct net_device *dev, - struct ethtool_ringparam *ering) +static int +bcm_enetsw_set_ringparam(struct net_device *dev, + struct ethtool_ringparam *ering, + struct kernel_ethtool_ringparam *kernel_ering, + struct netlink_ext_ack *extack) { struct bcm_enet_priv *priv; int was_running; diff --git a/drivers/net/ethernet/broadcom/bnx2.c b/drivers/net/ethernet/broadcom/bnx2.c index c749ee375f40..948f24fb4b6e 100644 --- a/drivers/net/ethernet/broadcom/bnx2.c +++ b/drivers/net/ethernet/broadcom/bnx2.c @@ -7317,7 +7317,9 @@ static int bnx2_set_coalesce(struct net_device *dev, }
static void -bnx2_get_ringparam(struct net_device *dev, struct ethtool_ringparam *ering) +bnx2_get_ringparam(struct net_device *dev, struct ethtool_ringparam *ering, + struct kernel_ethtool_ringparam *kernel_ering, + struct netlink_ext_ack *extack) { struct bnx2 *bp = netdev_priv(dev);
@@ -7388,7 +7390,9 @@ bnx2_change_ring_size(struct bnx2 *bp, u32 rx, u32 tx, bool reset_irq) }
static int -bnx2_set_ringparam(struct net_device *dev, struct ethtool_ringparam *ering) +bnx2_set_ringparam(struct net_device *dev, struct ethtool_ringparam *ering, + struct kernel_ethtool_ringparam *kernel_ering, + struct netlink_ext_ack *extack) { struct bnx2 *bp = netdev_priv(dev); int rc; diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_ethtool.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_ethtool.c index 472a3a478038..0e319ac7799f 100644 --- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_ethtool.c +++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_ethtool.c @@ -1914,7 +1914,9 @@ static int bnx2x_set_coalesce(struct net_device *dev, }
static void bnx2x_get_ringparam(struct net_device *dev, - struct ethtool_ringparam *ering) + struct ethtool_ringparam *ering, + struct kernel_ethtool_ringparam *kernel_ering, + struct netlink_ext_ack *extack) { struct bnx2x *bp = netdev_priv(dev);
@@ -1938,7 +1940,9 @@ static void bnx2x_get_ringparam(struct net_device *dev, }
static int bnx2x_set_ringparam(struct net_device *dev, - struct ethtool_ringparam *ering) + struct ethtool_ringparam *ering, + struct kernel_ethtool_ringparam *kernel_ering, + struct netlink_ext_ack *extack) { struct bnx2x *bp = netdev_priv(dev);
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c index 5c597229b3eb..f12a01ba86cf 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c @@ -764,7 +764,9 @@ static void bnxt_get_strings(struct net_device *dev, u32 stringset, u8 *buf) }
static void bnxt_get_ringparam(struct net_device *dev, - struct ethtool_ringparam *ering) + struct ethtool_ringparam *ering, + struct kernel_ethtool_ringparam *kernel_ering, + struct netlink_ext_ack *extack) { struct bnxt *bp = netdev_priv(dev);
@@ -778,7 +780,9 @@ static void bnxt_get_ringparam(struct net_device *dev, }
static int bnxt_set_ringparam(struct net_device *dev, - struct ethtool_ringparam *ering) + struct ethtool_ringparam *ering, + struct kernel_ethtool_ringparam *kernel_ering, + struct netlink_ext_ack *extack) { struct bnxt *bp = netdev_priv(dev);
diff --git a/drivers/net/ethernet/broadcom/tg3.c b/drivers/net/ethernet/broadcom/tg3.c index 85c6f627d665..1d93333f552a 100644 --- a/drivers/net/ethernet/broadcom/tg3.c +++ b/drivers/net/ethernet/broadcom/tg3.c @@ -12405,7 +12405,10 @@ static int tg3_nway_reset(struct net_device *dev) return r; }
-static void tg3_get_ringparam(struct net_device *dev, struct ethtool_ringparam *ering) +static void tg3_get_ringparam(struct net_device *dev, + struct ethtool_ringparam *ering, + struct kernel_ethtool_ringparam *kernel_ering, + struct netlink_ext_ack *extack) { struct tg3 *tp = netdev_priv(dev);
@@ -12426,7 +12429,10 @@ static void tg3_get_ringparam(struct net_device *dev, struct ethtool_ringparam * ering->tx_pending = tp->napi[0].tx_pending; }
-static int tg3_set_ringparam(struct net_device *dev, struct ethtool_ringparam *ering) +static int tg3_set_ringparam(struct net_device *dev, + struct ethtool_ringparam *ering, + struct kernel_ethtool_ringparam *kernel_ering, + struct netlink_ext_ack *extack) { struct tg3 *tp = netdev_priv(dev); int i, irq_sync = 0, err = 0; diff --git a/drivers/net/ethernet/brocade/bna/bnad_ethtool.c b/drivers/net/ethernet/brocade/bna/bnad_ethtool.c index 916bde53f13d..e4a024f59328 100644 --- a/drivers/net/ethernet/brocade/bna/bnad_ethtool.c +++ b/drivers/net/ethernet/brocade/bna/bnad_ethtool.c @@ -405,7 +405,9 @@ static int bnad_set_coalesce(struct net_device *netdev,
static void bnad_get_ringparam(struct net_device *netdev, - struct ethtool_ringparam *ringparam) + struct ethtool_ringparam *ringparam, + struct kernel_ethtool_ringparam *kernel_ringparam, + struct netlink_ext_ack *extack) { struct bnad *bnad = netdev_priv(netdev);
@@ -418,7 +420,9 @@ bnad_get_ringparam(struct net_device *netdev,
static int bnad_set_ringparam(struct net_device *netdev, - struct ethtool_ringparam *ringparam) + struct ethtool_ringparam *ringparam, + struct kernel_ethtool_ringparam *kernel_ringparam, + struct netlink_ext_ack *extack) { int i, current_err, err = 0; struct bnad *bnad = netdev_priv(netdev); diff --git a/drivers/net/ethernet/cadence/macb_main.c b/drivers/net/ethernet/cadence/macb_main.c index 1e8bf6b9834b..f2ec92b6123d 100644 --- a/drivers/net/ethernet/cadence/macb_main.c +++ b/drivers/net/ethernet/cadence/macb_main.c @@ -2935,7 +2935,9 @@ static int macb_set_link_ksettings(struct net_device *netdev, }
static void macb_get_ringparam(struct net_device *netdev, - struct ethtool_ringparam *ring) + struct ethtool_ringparam *ring, + struct kernel_ethtool_ringparam *kernel_ring, + struct netlink_ext_ack *extack) { struct macb *bp = netdev_priv(netdev);
@@ -2947,7 +2949,9 @@ static void macb_get_ringparam(struct net_device *netdev, }
static int macb_set_ringparam(struct net_device *netdev, - struct ethtool_ringparam *ring) + struct ethtool_ringparam *ring, + struct kernel_ethtool_ringparam *kernel_ring, + struct netlink_ext_ack *extack) { struct macb *bp = netdev_priv(netdev); u32 new_rx_size, new_tx_size; diff --git a/drivers/net/ethernet/cavium/liquidio/lio_ethtool.c b/drivers/net/ethernet/cavium/liquidio/lio_ethtool.c index 999dc567c8ee..f49a775d3642 100644 --- a/drivers/net/ethernet/cavium/liquidio/lio_ethtool.c +++ b/drivers/net/ethernet/cavium/liquidio/lio_ethtool.c @@ -946,7 +946,9 @@ static int lio_set_phys_id(struct net_device *netdev,
static void lio_ethtool_get_ringparam(struct net_device *netdev, - struct ethtool_ringparam *ering) + struct ethtool_ringparam *ering, + struct kernel_ethtool_ringparam *kernel_ering, + struct netlink_ext_ack *extack) { struct lio *lio = GET_LIO(netdev); struct octeon_device *oct = lio->oct_dev; @@ -1251,8 +1253,11 @@ static int lio_reset_queues(struct net_device *netdev, uint32_t num_qs) return 0; }
-static int lio_ethtool_set_ringparam(struct net_device *netdev, - struct ethtool_ringparam *ering) +static int +lio_ethtool_set_ringparam(struct net_device *netdev, + struct ethtool_ringparam *ering, + struct kernel_ethtool_ringparam *kernel_ering, + struct netlink_ext_ack *extack) { u32 rx_count, tx_count, rx_count_old, tx_count_old; struct lio *lio = GET_LIO(netdev); diff --git a/drivers/net/ethernet/cavium/thunder/nicvf_ethtool.c b/drivers/net/ethernet/cavium/thunder/nicvf_ethtool.c index f794273df95a..9002cdea4084 100644 --- a/drivers/net/ethernet/cavium/thunder/nicvf_ethtool.c +++ b/drivers/net/ethernet/cavium/thunder/nicvf_ethtool.c @@ -466,7 +466,9 @@ static int nicvf_get_coalesce(struct net_device *netdev, }
static void nicvf_get_ringparam(struct net_device *netdev, - struct ethtool_ringparam *ring) + struct ethtool_ringparam *ring, + struct kernel_ethtool_ringparam *kernel_ring, + struct netlink_ext_ack *extack) { struct nicvf *nic = netdev_priv(netdev); struct queue_set *qs = nic->qs; @@ -478,7 +480,9 @@ static void nicvf_get_ringparam(struct net_device *netdev, }
static int nicvf_set_ringparam(struct net_device *netdev, - struct ethtool_ringparam *ring) + struct ethtool_ringparam *ring, + struct kernel_ethtool_ringparam *kernel_ring, + struct netlink_ext_ack *extack) { struct nicvf *nic = netdev_priv(netdev); struct queue_set *qs = nic->qs; diff --git a/drivers/net/ethernet/chelsio/cxgb/cxgb2.c b/drivers/net/ethernet/chelsio/cxgb/cxgb2.c index 2c5495fa64aa..1311eac9eef2 100644 --- a/drivers/net/ethernet/chelsio/cxgb/cxgb2.c +++ b/drivers/net/ethernet/chelsio/cxgb/cxgb2.c @@ -709,7 +709,9 @@ static int set_pauseparam(struct net_device *dev, return 0; }
-static void get_sge_param(struct net_device *dev, struct ethtool_ringparam *e) +static void get_sge_param(struct net_device *dev, struct ethtool_ringparam *e, + struct kernel_ethtool_ringparam *kernel_e, + struct netlink_ext_ack *extack) { struct adapter *adapter = dev->ml_priv; int jumbo_fl = t1_is_T1B(adapter) ? 1 : 0; @@ -723,7 +725,9 @@ static void get_sge_param(struct net_device *dev, struct ethtool_ringparam *e) e->tx_pending = adapter->params.sge.cmdQ_size[0]; }
-static int set_sge_param(struct net_device *dev, struct ethtool_ringparam *e) +static int set_sge_param(struct net_device *dev, struct ethtool_ringparam *e, + struct kernel_ethtool_ringparam *kernel_e, + struct netlink_ext_ack *extack) { struct adapter *adapter = dev->ml_priv; int jumbo_fl = t1_is_T1B(adapter) ? 1 : 0; diff --git a/drivers/net/ethernet/chelsio/cxgb3/cxgb3_main.c b/drivers/net/ethernet/chelsio/cxgb3/cxgb3_main.c index c22ddbd20664..9ac0cab9c967 100644 --- a/drivers/net/ethernet/chelsio/cxgb3/cxgb3_main.c +++ b/drivers/net/ethernet/chelsio/cxgb3/cxgb3_main.c @@ -1948,7 +1948,9 @@ static int set_pauseparam(struct net_device *dev, return 0; }
-static void get_sge_param(struct net_device *dev, struct ethtool_ringparam *e) +static void get_sge_param(struct net_device *dev, struct ethtool_ringparam *e, + struct kernel_ethtool_ringparam *kernel_e, + struct netlink_ext_ack *extack) { struct port_info *pi = netdev_priv(dev); struct adapter *adapter = pi->adapter; @@ -1964,7 +1966,9 @@ static void get_sge_param(struct net_device *dev, struct ethtool_ringparam *e) e->tx_pending = q->txq_size[0]; }
-static int set_sge_param(struct net_device *dev, struct ethtool_ringparam *e) +static int set_sge_param(struct net_device *dev, struct ethtool_ringparam *e, + struct kernel_ethtool_ringparam *kernel_e, + struct netlink_ext_ack *extack) { struct port_info *pi = netdev_priv(dev); struct adapter *adapter = pi->adapter; diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_ethtool.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_ethtool.c index 129352bbe114..0e4ec4079741 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_ethtool.c +++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_ethtool.c @@ -890,7 +890,9 @@ static int set_pauseparam(struct net_device *dev, return 0; }
-static void get_sge_param(struct net_device *dev, struct ethtool_ringparam *e) +static void get_sge_param(struct net_device *dev, struct ethtool_ringparam *e, + struct kernel_ethtool_ringparam *kernel_e, + struct netlink_ext_ack *extack) { const struct port_info *pi = netdev_priv(dev); const struct sge *s = &pi->adapter->sge; @@ -906,7 +908,9 @@ static void get_sge_param(struct net_device *dev, struct ethtool_ringparam *e) e->tx_pending = s->ethtxq[pi->first_qset].q.size; }
-static int set_sge_param(struct net_device *dev, struct ethtool_ringparam *e) +static int set_sge_param(struct net_device *dev, struct ethtool_ringparam *e, + struct kernel_ethtool_ringparam *kernel_e, + struct netlink_ext_ack *extack) { int i; const struct port_info *pi = netdev_priv(dev); diff --git a/drivers/net/ethernet/chelsio/cxgb4vf/cxgb4vf_main.c b/drivers/net/ethernet/chelsio/cxgb4vf/cxgb4vf_main.c index fabe4eb88842..d1c5ea84c9a0 100644 --- a/drivers/net/ethernet/chelsio/cxgb4vf/cxgb4vf_main.c +++ b/drivers/net/ethernet/chelsio/cxgb4vf/cxgb4vf_main.c @@ -1591,7 +1591,9 @@ static void cxgb4vf_set_msglevel(struct net_device *dev, u32 msglevel) * first Queue Set. */ static void cxgb4vf_get_ringparam(struct net_device *dev, - struct ethtool_ringparam *rp) + struct ethtool_ringparam *rp, + struct kernel_ethtool_ringparam *kernel_rp, + struct netlink_ext_ack *extack) { const struct port_info *pi = netdev_priv(dev); const struct sge *s = &pi->adapter->sge; @@ -1614,7 +1616,9 @@ static void cxgb4vf_get_ringparam(struct net_device *dev, * device -- after vetting them of course! */ static int cxgb4vf_set_ringparam(struct net_device *dev, - struct ethtool_ringparam *rp) + struct ethtool_ringparam *rp, + struct kernel_ethtool_ringparam *kernel_rp, + struct netlink_ext_ack *extack) { const struct port_info *pi = netdev_priv(dev); struct adapter *adapter = pi->adapter; diff --git a/drivers/net/ethernet/cisco/enic/enic_ethtool.c b/drivers/net/ethernet/cisco/enic/enic_ethtool.c index 12ffc14fbecd..fa1da1444d23 100644 --- a/drivers/net/ethernet/cisco/enic/enic_ethtool.c +++ b/drivers/net/ethernet/cisco/enic/enic_ethtool.c @@ -177,7 +177,9 @@ static void enic_get_strings(struct net_device *netdev, u32 stringset, }
static void enic_get_ringparam(struct net_device *netdev, - struct ethtool_ringparam *ring) + struct ethtool_ringparam *ring, + struct kernel_ethtool_ringparam *kernel_ring, + struct netlink_ext_ack *extack) { struct enic *enic = netdev_priv(netdev); struct vnic_enet_config *c = &enic->config; @@ -189,7 +191,9 @@ static void enic_get_ringparam(struct net_device *netdev, }
static int enic_set_ringparam(struct net_device *netdev, - struct ethtool_ringparam *ring) + struct ethtool_ringparam *ring, + struct kernel_ethtool_ringparam *kernel_ring, + struct netlink_ext_ack *extack) { struct enic *enic = netdev_priv(netdev); struct vnic_enet_config *c = &enic->config; diff --git a/drivers/net/ethernet/cortina/gemini.c b/drivers/net/ethernet/cortina/gemini.c index 75ea23d42055..b08029245ce8 100644 --- a/drivers/net/ethernet/cortina/gemini.c +++ b/drivers/net/ethernet/cortina/gemini.c @@ -2105,7 +2105,9 @@ static void gmac_get_pauseparam(struct net_device *netdev, }
static void gmac_get_ringparam(struct net_device *netdev, - struct ethtool_ringparam *rp) + struct ethtool_ringparam *rp, + struct kernel_ethtool_ringparam *kernel_rp, + struct netlink_ext_ack *extack) { struct gemini_ethernet_port *port = netdev_priv(netdev);
@@ -2123,7 +2125,9 @@ static void gmac_get_ringparam(struct net_device *netdev, }
static int gmac_set_ringparam(struct net_device *netdev, - struct ethtool_ringparam *rp) + struct ethtool_ringparam *rp, + struct kernel_ethtool_ringparam *kernel_rp, + struct netlink_ext_ack *extack) { struct gemini_ethernet_port *port = netdev_priv(netdev); int err = 0; diff --git a/drivers/net/ethernet/emulex/benet/be_ethtool.c b/drivers/net/ethernet/emulex/benet/be_ethtool.c index f9955308b93d..dfa784339781 100644 --- a/drivers/net/ethernet/emulex/benet/be_ethtool.c +++ b/drivers/net/ethernet/emulex/benet/be_ethtool.c @@ -683,7 +683,9 @@ static int be_get_link_ksettings(struct net_device *netdev, }
static void be_get_ringparam(struct net_device *netdev, - struct ethtool_ringparam *ring) + struct ethtool_ringparam *ring, + struct kernel_ethtool_ringparam *kernel_ring, + struct netlink_ext_ack *extack) { struct be_adapter *adapter = netdev_priv(netdev);
diff --git a/drivers/net/ethernet/ethoc.c b/drivers/net/ethernet/ethoc.c index 3d9b0b161e24..323913cbb837 100644 --- a/drivers/net/ethernet/ethoc.c +++ b/drivers/net/ethernet/ethoc.c @@ -949,7 +949,9 @@ static void ethoc_get_regs(struct net_device *dev, struct ethtool_regs *regs, }
static void ethoc_get_ringparam(struct net_device *dev, - struct ethtool_ringparam *ring) + struct ethtool_ringparam *ring, + struct kernel_ethtool_ringparam *kernel_ring, + struct netlink_ext_ack *extack) { struct ethoc *priv = netdev_priv(dev);
@@ -965,7 +967,9 @@ static void ethoc_get_ringparam(struct net_device *dev, }
static int ethoc_set_ringparam(struct net_device *dev, - struct ethtool_ringparam *ring) + struct ethtool_ringparam *ring, + struct kernel_ethtool_ringparam *kernel_ring, + struct netlink_ext_ack *extack) { struct ethoc *priv = netdev_priv(dev);
diff --git a/drivers/net/ethernet/faraday/ftgmac100.c b/drivers/net/ethernet/faraday/ftgmac100.c index 5bc11d1bb9df..0328813b19fd 100644 --- a/drivers/net/ethernet/faraday/ftgmac100.c +++ b/drivers/net/ethernet/faraday/ftgmac100.c @@ -1152,8 +1152,11 @@ static void ftgmac100_get_drvinfo(struct net_device *netdev, strlcpy(info->bus_info, dev_name(&netdev->dev), sizeof(info->bus_info)); }
-static void ftgmac100_get_ringparam(struct net_device *netdev, - struct ethtool_ringparam *ering) +static void +ftgmac100_get_ringparam(struct net_device *netdev, + struct ethtool_ringparam *ering, + struct kernel_ethtool_ringparam *kernel_ering, + struct netlink_ext_ack *extack) { struct ftgmac100 *priv = netdev_priv(netdev);
@@ -1164,8 +1167,11 @@ static void ftgmac100_get_ringparam(struct net_device *netdev, ering->tx_pending = priv->tx_q_entries; }
-static int ftgmac100_set_ringparam(struct net_device *netdev, - struct ethtool_ringparam *ering) +static int +ftgmac100_set_ringparam(struct net_device *netdev, + struct ethtool_ringparam *ering, + struct kernel_ethtool_ringparam *kernel_ering, + struct netlink_ext_ack *extack) { struct ftgmac100 *priv = netdev_priv(netdev);
diff --git a/drivers/net/ethernet/freescale/enetc/enetc_ethtool.c b/drivers/net/ethernet/freescale/enetc/enetc_ethtool.c index a5076d5e12ea..37f9554618e9 100644 --- a/drivers/net/ethernet/freescale/enetc/enetc_ethtool.c +++ b/drivers/net/ethernet/freescale/enetc/enetc_ethtool.c @@ -545,7 +545,9 @@ static int enetc_set_rxfh(struct net_device *ndev, const u32 *indir, }
static void enetc_get_ringparam(struct net_device *ndev, - struct ethtool_ringparam *ring) + struct ethtool_ringparam *ring, + struct kernel_ethtool_ringparam *kernel_ring, + struct netlink_ext_ack *extack) { struct enetc_ndev_priv *priv = netdev_priv(ndev);
diff --git a/drivers/net/ethernet/freescale/gianfar_ethtool.c b/drivers/net/ethernet/freescale/gianfar_ethtool.c index 7b32ed29bf4c..ff756265d58f 100644 --- a/drivers/net/ethernet/freescale/gianfar_ethtool.c +++ b/drivers/net/ethernet/freescale/gianfar_ethtool.c @@ -372,7 +372,9 @@ static int gfar_scoalesce(struct net_device *dev, * rx, rx_mini, and rx_jumbo rings are the same size, as mini and * jumbo are ignored by the driver */ static void gfar_gringparam(struct net_device *dev, - struct ethtool_ringparam *rvals) + struct ethtool_ringparam *rvals, + struct kernel_ethtool_ringparam *kernel_rvals, + struct netlink_ext_ack *extack) { struct gfar_private *priv = netdev_priv(dev); struct gfar_priv_tx_q *tx_queue = NULL; @@ -399,7 +401,9 @@ static void gfar_gringparam(struct net_device *dev, * necessary so that we don't mess things up while we're in motion. */ static int gfar_sringparam(struct net_device *dev, - struct ethtool_ringparam *rvals) + struct ethtool_ringparam *rvals, + struct kernel_ethtool_ringparam *kernel_rvals, + struct netlink_ext_ack *extack) { struct gfar_private *priv = netdev_priv(dev); int err = 0, i; diff --git a/drivers/net/ethernet/freescale/ucc_geth_ethtool.c b/drivers/net/ethernet/freescale/ucc_geth_ethtool.c index 14c08a868190..69b2b98b1525 100644 --- a/drivers/net/ethernet/freescale/ucc_geth_ethtool.c +++ b/drivers/net/ethernet/freescale/ucc_geth_ethtool.c @@ -207,7 +207,9 @@ uec_get_regs(struct net_device *netdev,
static void uec_get_ringparam(struct net_device *netdev, - struct ethtool_ringparam *ring) + struct ethtool_ringparam *ring, + struct kernel_ethtool_ringparam *kernel_ring, + struct netlink_ext_ack *extack) { struct ucc_geth_private *ugeth = netdev_priv(netdev); struct ucc_geth_info *ug_info = ugeth->ug_info; @@ -226,7 +228,9 @@ uec_get_ringparam(struct net_device *netdev,
static int uec_set_ringparam(struct net_device *netdev, - struct ethtool_ringparam *ring) + struct ethtool_ringparam *ring, + struct kernel_ethtool_ringparam *kernel_ring, + struct netlink_ext_ack *extack) { struct ucc_geth_private *ugeth = netdev_priv(netdev); struct ucc_geth_info *ug_info = ugeth->ug_info; diff --git a/drivers/net/ethernet/google/gve/gve_ethtool.c b/drivers/net/ethernet/google/gve/gve_ethtool.c index 7b44769bd87c..66f9b377f8e7 100644 --- a/drivers/net/ethernet/google/gve/gve_ethtool.c +++ b/drivers/net/ethernet/google/gve/gve_ethtool.c @@ -405,7 +405,9 @@ static int gve_set_channels(struct net_device *netdev, }
static void gve_get_ringparam(struct net_device *netdev, - struct ethtool_ringparam *cmd) + struct ethtool_ringparam *cmd, + struct kernel_ethtool_ringparam *kernel_cmd, + struct netlink_ext_ack *extack) { struct gve_priv *priv = netdev_priv(netdev);
diff --git a/drivers/net/ethernet/hisilicon/hns/hns_ethtool.c b/drivers/net/ethernet/hisilicon/hns/hns_ethtool.c index bf0f0ca82cd0..e54a687a019f 100644 --- a/drivers/net/ethernet/hisilicon/hns/hns_ethtool.c +++ b/drivers/net/ethernet/hisilicon/hns/hns_ethtool.c @@ -664,9 +664,13 @@ static void hns_nic_get_drvinfo(struct net_device *net_dev, * hns_get_ringparam - get ring parameter * @net_dev: net device * @param: ethtool parameter + * @kernel_param: ethtool external parameter + * @extack: netlink extended ACK report struct */ static void hns_get_ringparam(struct net_device *net_dev, - struct ethtool_ringparam *param) + struct ethtool_ringparam *param, + struct kernel_ethtool_ringparam *kernel_param, + struct netlink_ext_ack *extack) { struct hns_nic_priv *priv = netdev_priv(net_dev); struct hnae_ae_ops *ops; diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c b/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c index 9a816dbec613..e35a2661b45d 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c @@ -643,7 +643,9 @@ static u32 hns3_get_link(struct net_device *netdev) }
static void hns3_get_ringparam(struct net_device *netdev, - struct ethtool_ringparam *param) + struct ethtool_ringparam *param, + struct kernel_ethtool_ringparam *kernel_param, + struct netlink_ext_ack *extack) { struct hns3_nic_priv *priv = netdev_priv(netdev); struct hnae3_handle *h = priv->ae_handle; @@ -1081,7 +1083,9 @@ static int hns3_check_ringparam(struct net_device *ndev, }
static int hns3_set_ringparam(struct net_device *ndev, - struct ethtool_ringparam *param) + struct ethtool_ringparam *param, + struct kernel_ethtool_ringparam *kernel_param, + struct netlink_ext_ack *extack) { struct hns3_nic_priv *priv = netdev_priv(ndev); struct hnae3_handle *h = priv->ae_handle; diff --git a/drivers/net/ethernet/huawei/hinic/hinic_ethtool.c b/drivers/net/ethernet/huawei/hinic/hinic_ethtool.c index 28c7a34d204a..26d695684cfb 100644 --- a/drivers/net/ethernet/huawei/hinic/hinic_ethtool.c +++ b/drivers/net/ethernet/huawei/hinic/hinic_ethtool.c @@ -905,7 +905,9 @@ static int hinic_nway_reset(struct net_device *netdev) }
static void hinic_get_ringparam(struct net_device *netdev, - struct ethtool_ringparam *ring) + struct ethtool_ringparam *ring, + struct kernel_ethtool_ringparam *kernel_ring, + struct netlink_ext_ack *extack) { struct hinic_nic_dev *nic_dev = netdev_priv(netdev);
@@ -931,7 +933,9 @@ static void hinic_update_qp_depth(struct hinic_nic_dev *nic_dev, }
static int hinic_set_ringparam(struct net_device *netdev, - struct ethtool_ringparam *ring) + struct ethtool_ringparam *ring, + struct kernel_ethtool_ringparam *kernel_ring, + struct netlink_ext_ack *extack) { struct hinic_nic_dev *nic_dev = netdev_priv(netdev); u16 new_sq_depth, new_rq_depth; diff --git a/drivers/net/ethernet/ibm/emac/core.c b/drivers/net/ethernet/ibm/emac/core.c index c00b9097eeea..bb17357d8a13 100644 --- a/drivers/net/ethernet/ibm/emac/core.c +++ b/drivers/net/ethernet/ibm/emac/core.c @@ -2136,8 +2136,11 @@ emac_ethtool_set_link_ksettings(struct net_device *ndev, return 0; }
-static void emac_ethtool_get_ringparam(struct net_device *ndev, - struct ethtool_ringparam *rp) +static void +emac_ethtool_get_ringparam(struct net_device *ndev, + struct ethtool_ringparam *rp, + struct kernel_ethtool_ringparam *kernel_rp, + struct netlink_ext_ack *extack) { rp->rx_max_pending = rp->rx_pending = NUM_RX_BUFF; rp->tx_max_pending = rp->tx_pending = NUM_TX_BUFF; diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c index 4f99d9763824..fadef994320c 100644 --- a/drivers/net/ethernet/ibm/ibmvnic.c +++ b/drivers/net/ethernet/ibm/ibmvnic.c @@ -2659,7 +2659,9 @@ static u32 ibmvnic_get_link(struct net_device *netdev) }
static void ibmvnic_get_ringparam(struct net_device *netdev, - struct ethtool_ringparam *ring) + struct ethtool_ringparam *ring, + struct kernel_ethtool_ringparam *kernel_ring, + struct netlink_ext_ack *extack) { struct ibmvnic_adapter *adapter = netdev_priv(netdev);
@@ -2679,7 +2681,9 @@ static void ibmvnic_get_ringparam(struct net_device *netdev, }
static int ibmvnic_set_ringparam(struct net_device *netdev, - struct ethtool_ringparam *ring) + struct ethtool_ringparam *ring, + struct kernel_ethtool_ringparam *kernel_ring, + struct netlink_ext_ack *extack) { struct ibmvnic_adapter *adapter = netdev_priv(netdev); int ret; diff --git a/drivers/net/ethernet/intel/e100.c b/drivers/net/ethernet/intel/e100.c index ee86ea12fa37..afe824aad954 100644 --- a/drivers/net/ethernet/intel/e100.c +++ b/drivers/net/ethernet/intel/e100.c @@ -2551,7 +2551,9 @@ static int e100_set_eeprom(struct net_device *netdev, }
static void e100_get_ringparam(struct net_device *netdev, - struct ethtool_ringparam *ring) + struct ethtool_ringparam *ring, + struct kernel_ethtool_ringparam *kernel_ring, + struct netlink_ext_ack *extack) { struct nic *nic = netdev_priv(netdev); struct param_range *rfds = &nic->params.rfds; @@ -2564,7 +2566,9 @@ static void e100_get_ringparam(struct net_device *netdev, }
static int e100_set_ringparam(struct net_device *netdev, - struct ethtool_ringparam *ring) + struct ethtool_ringparam *ring, + struct kernel_ethtool_ringparam *kernel_ring, + struct netlink_ext_ack *extack) { struct nic *nic = netdev_priv(netdev); struct param_range *rfds = &nic->params.rfds; diff --git a/drivers/net/ethernet/intel/e1000/e1000_ethtool.c b/drivers/net/ethernet/intel/e1000/e1000_ethtool.c index 3ac044ad4f55..f41e3ca761c9 100644 --- a/drivers/net/ethernet/intel/e1000/e1000_ethtool.c +++ b/drivers/net/ethernet/intel/e1000/e1000_ethtool.c @@ -539,7 +539,9 @@ static void e1000_get_drvinfo(struct net_device *netdev, }
static void e1000_get_ringparam(struct net_device *netdev, - struct ethtool_ringparam *ring) + struct ethtool_ringparam *ring, + struct kernel_ethtool_ringparam *kernel_ring, + struct netlink_ext_ack *extack) { struct e1000_adapter *adapter = netdev_priv(netdev); struct e1000_hw *hw = &adapter->hw; @@ -556,7 +558,9 @@ static void e1000_get_ringparam(struct net_device *netdev, }
static int e1000_set_ringparam(struct net_device *netdev, - struct ethtool_ringparam *ring) + struct ethtool_ringparam *ring, + struct kernel_ethtool_ringparam *kernel_ring, + struct netlink_ext_ack *extack) { struct e1000_adapter *adapter = netdev_priv(netdev); struct e1000_hw *hw = &adapter->hw; diff --git a/drivers/net/ethernet/intel/e1000e/ethtool.c b/drivers/net/ethernet/intel/e1000e/ethtool.c index 9b2071d3ebdc..fe42a10871ef 100644 --- a/drivers/net/ethernet/intel/e1000e/ethtool.c +++ b/drivers/net/ethernet/intel/e1000e/ethtool.c @@ -655,7 +655,9 @@ static void e1000_get_drvinfo(struct net_device *netdev, }
static void e1000_get_ringparam(struct net_device *netdev, - struct ethtool_ringparam *ring) + struct ethtool_ringparam *ring, + struct kernel_ethtool_ringparam *kernel_ring, + struct netlink_ext_ack *extack) { struct e1000_adapter *adapter = netdev_priv(netdev);
@@ -666,7 +668,9 @@ static void e1000_get_ringparam(struct net_device *netdev, }
static int e1000_set_ringparam(struct net_device *netdev, - struct ethtool_ringparam *ring) + struct ethtool_ringparam *ring, + struct kernel_ethtool_ringparam *kernel_ring, + struct netlink_ext_ack *extack) { struct e1000_adapter *adapter = netdev_priv(netdev); struct e1000_ring *temp_tx = NULL, *temp_rx = NULL; diff --git a/drivers/net/ethernet/intel/fm10k/fm10k_ethtool.c b/drivers/net/ethernet/intel/fm10k/fm10k_ethtool.c index 57b78566c364..feb42272f371 100644 --- a/drivers/net/ethernet/intel/fm10k/fm10k_ethtool.c +++ b/drivers/net/ethernet/intel/fm10k/fm10k_ethtool.c @@ -501,7 +501,9 @@ static void fm10k_set_msglevel(struct net_device *netdev, u32 data) }
static void fm10k_get_ringparam(struct net_device *netdev, - struct ethtool_ringparam *ring) + struct ethtool_ringparam *ring, + struct kernel_ethtool_ringparam *kernel_ring, + struct netlink_ext_ack *extack) { struct fm10k_intfc *interface = netdev_priv(netdev);
@@ -516,7 +518,9 @@ static void fm10k_get_ringparam(struct net_device *netdev, }
static int fm10k_set_ringparam(struct net_device *netdev, - struct ethtool_ringparam *ring) + struct ethtool_ringparam *ring, + struct kernel_ethtool_ringparam *kernel_ring, + struct netlink_ext_ack *extack) { struct fm10k_intfc *interface = netdev_priv(netdev); struct fm10k_ring *temp_ring; diff --git a/drivers/net/ethernet/intel/i40e/i40e_ethtool.c b/drivers/net/ethernet/intel/i40e/i40e_ethtool.c index ffbfb8a091dd..97e89fd79081 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_ethtool.c +++ b/drivers/net/ethernet/intel/i40e/i40e_ethtool.c @@ -1916,7 +1916,9 @@ static void i40e_get_drvinfo(struct net_device *netdev, }
static void i40e_get_ringparam(struct net_device *netdev, - struct ethtool_ringparam *ring) + struct ethtool_ringparam *ring, + struct kernel_ethtool_ringparam *kernel_ring, + struct netlink_ext_ack *extack) { struct i40e_netdev_priv *np = netdev_priv(netdev); struct i40e_pf *pf = np->vsi->back; @@ -1944,7 +1946,9 @@ static bool i40e_active_tx_ring_index(struct i40e_vsi *vsi, u16 index) }
static int i40e_set_ringparam(struct net_device *netdev, - struct ethtool_ringparam *ring) + struct ethtool_ringparam *ring, + struct kernel_ethtool_ringparam *kernel_ring, + struct netlink_ext_ack *extack) { struct i40e_ring *tx_rings = NULL, *rx_rings = NULL; struct i40e_netdev_priv *np = netdev_priv(netdev); diff --git a/drivers/net/ethernet/intel/iavf/iavf_ethtool.c b/drivers/net/ethernet/intel/iavf/iavf_ethtool.c index 63458ee7021d..3d0db8273c92 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_ethtool.c +++ b/drivers/net/ethernet/intel/iavf/iavf_ethtool.c @@ -580,12 +580,16 @@ static void iavf_get_drvinfo(struct net_device *netdev, * iavf_get_ringparam - Get ring parameters * @netdev: network interface device structure * @ring: ethtool ringparam structure + * @kernel_ring: ethtool extenal ringparam structure + * @extack: netlink extended ACK report struct * * Returns current ring parameters. TX and RX rings are reported separately, * but the number of rings is not reported. **/ static void iavf_get_ringparam(struct net_device *netdev, - struct ethtool_ringparam *ring) + struct ethtool_ringparam *ring, + struct kernel_ethtool_ringparam *kernel_ring, + struct netlink_ext_ack *extack) { struct iavf_adapter *adapter = netdev_priv(netdev);
@@ -599,12 +603,16 @@ static void iavf_get_ringparam(struct net_device *netdev, * iavf_set_ringparam - Set ring parameters * @netdev: network interface device structure * @ring: ethtool ringparam structure + * @kernel_ring: ethtool external ringparam structure + * @extack: netlink extended ACK report struct * * Sets ring parameters. TX and RX rings are controlled separately, but the * number of rings is not specified, so all rings get the same settings. **/ static int iavf_set_ringparam(struct net_device *netdev, - struct ethtool_ringparam *ring) + struct ethtool_ringparam *ring, + struct kernel_ethtool_ringparam *kernel_ring, + struct netlink_ext_ack *extack) { struct iavf_adapter *adapter = netdev_priv(netdev); u32 new_rx_count, new_tx_count; diff --git a/drivers/net/ethernet/intel/ice/ice_ethtool.c b/drivers/net/ethernet/intel/ice/ice_ethtool.c index 76f5dd8d08a5..0c596b67b689 100644 --- a/drivers/net/ethernet/intel/ice/ice_ethtool.c +++ b/drivers/net/ethernet/intel/ice/ice_ethtool.c @@ -2679,7 +2679,9 @@ ice_get_rxnfc(struct net_device *netdev, struct ethtool_rxnfc *cmd, }
static void -ice_get_ringparam(struct net_device *netdev, struct ethtool_ringparam *ring) +ice_get_ringparam(struct net_device *netdev, struct ethtool_ringparam *ring, + struct kernel_ethtool_ringparam *kernel_ring, + struct netlink_ext_ack *extack) { struct ice_netdev_priv *np = netdev_priv(netdev); struct ice_vsi *vsi = np->vsi; @@ -2697,7 +2699,9 @@ ice_get_ringparam(struct net_device *netdev, struct ethtool_ringparam *ring) }
static int -ice_set_ringparam(struct net_device *netdev, struct ethtool_ringparam *ring) +ice_set_ringparam(struct net_device *netdev, struct ethtool_ringparam *ring, + struct kernel_ethtool_ringparam *kernel_ring, + struct netlink_ext_ack *extack) { struct ice_ring *tx_rings = NULL, *rx_rings = NULL; struct ice_netdev_priv *np = netdev_priv(netdev); diff --git a/drivers/net/ethernet/intel/igb/igb_ethtool.c b/drivers/net/ethernet/intel/igb/igb_ethtool.c index c75bbbd39eff..f09c7dacdf1e 100644 --- a/drivers/net/ethernet/intel/igb/igb_ethtool.c +++ b/drivers/net/ethernet/intel/igb/igb_ethtool.c @@ -864,7 +864,9 @@ static void igb_get_drvinfo(struct net_device *netdev, }
static void igb_get_ringparam(struct net_device *netdev, - struct ethtool_ringparam *ring) + struct ethtool_ringparam *ring, + struct kernel_ethtool_ringparam *kernel_ring, + struct netlink_ext_ack *extack) { struct igb_adapter *adapter = netdev_priv(netdev);
@@ -875,7 +877,9 @@ static void igb_get_ringparam(struct net_device *netdev, }
static int igb_set_ringparam(struct net_device *netdev, - struct ethtool_ringparam *ring) + struct ethtool_ringparam *ring, + struct kernel_ethtool_ringparam *kernel_ring, + struct netlink_ext_ack *extack) { struct igb_adapter *adapter = netdev_priv(netdev); struct igb_ring *temp_ring; diff --git a/drivers/net/ethernet/intel/igbvf/ethtool.c b/drivers/net/ethernet/intel/igbvf/ethtool.c index 06e5bd646a0e..9d4322b74163 100644 --- a/drivers/net/ethernet/intel/igbvf/ethtool.c +++ b/drivers/net/ethernet/intel/igbvf/ethtool.c @@ -175,7 +175,9 @@ static void igbvf_get_drvinfo(struct net_device *netdev, }
static void igbvf_get_ringparam(struct net_device *netdev, - struct ethtool_ringparam *ring) + struct ethtool_ringparam *ring, + struct kernel_ethtool_ringparam *kernel_ring, + struct netlink_ext_ack *extack) { struct igbvf_adapter *adapter = netdev_priv(netdev); struct igbvf_ring *tx_ring = adapter->tx_ring; @@ -188,7 +190,9 @@ static void igbvf_get_ringparam(struct net_device *netdev, }
static int igbvf_set_ringparam(struct net_device *netdev, - struct ethtool_ringparam *ring) + struct ethtool_ringparam *ring, + struct kernel_ethtool_ringparam *kernel_ring, + struct netlink_ext_ack *extack) { struct igbvf_adapter *adapter = netdev_priv(netdev); struct igbvf_ring *temp_ring; diff --git a/drivers/net/ethernet/intel/igc/igc_ethtool.c b/drivers/net/ethernet/intel/igc/igc_ethtool.c index 97ce3397f47f..63da96f69907 100644 --- a/drivers/net/ethernet/intel/igc/igc_ethtool.c +++ b/drivers/net/ethernet/intel/igc/igc_ethtool.c @@ -549,8 +549,11 @@ static int igc_ethtool_set_eeprom(struct net_device *netdev, return ret_val; }
-static void igc_ethtool_get_ringparam(struct net_device *netdev, - struct ethtool_ringparam *ring) +static void +igc_ethtool_get_ringparam(struct net_device *netdev, + struct ethtool_ringparam *ring, + struct kernel_ethtool_ringparam *kernel_ering, + struct netlink_ext_ack *extack) { struct igc_adapter *adapter = netdev_priv(netdev);
@@ -560,8 +563,11 @@ static void igc_ethtool_get_ringparam(struct net_device *netdev, ring->tx_pending = adapter->tx_ring_count; }
-static int igc_ethtool_set_ringparam(struct net_device *netdev, - struct ethtool_ringparam *ring) +static int +igc_ethtool_set_ringparam(struct net_device *netdev, + struct ethtool_ringparam *ring, + struct kernel_ethtool_ringparam *kernel_ering, + struct netlink_ext_ack *extack) { struct igc_adapter *adapter = netdev_priv(netdev); struct igc_ring *temp_ring; diff --git a/drivers/net/ethernet/intel/ixgb/ixgb_ethtool.c b/drivers/net/ethernet/intel/ixgb/ixgb_ethtool.c index 582099a5ad41..46efcfab7234 100644 --- a/drivers/net/ethernet/intel/ixgb/ixgb_ethtool.c +++ b/drivers/net/ethernet/intel/ixgb/ixgb_ethtool.c @@ -464,7 +464,9 @@ ixgb_get_drvinfo(struct net_device *netdev,
static void ixgb_get_ringparam(struct net_device *netdev, - struct ethtool_ringparam *ring) + struct ethtool_ringparam *ring, + struct kernel_ethtool_ringparam *kernel_ring, + struct netlink_ext_ack *extack) { struct ixgb_adapter *adapter = netdev_priv(netdev); struct ixgb_desc_ring *txdr = &adapter->tx_ring; @@ -478,7 +480,9 @@ ixgb_get_ringparam(struct net_device *netdev,
static int ixgb_set_ringparam(struct net_device *netdev, - struct ethtool_ringparam *ring) + struct ethtool_ringparam *ring, + struct kernel_ethtool_ringparam *kernel_ring, + struct netlink_ext_ack *extack) { struct ixgb_adapter *adapter = netdev_priv(netdev); struct ixgb_desc_ring *txdr = &adapter->tx_ring; diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c index bb2c5c63f7f2..089e1b64b59a 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c @@ -1119,7 +1119,9 @@ static void ixgbe_get_drvinfo(struct net_device *netdev, }
static void ixgbe_get_ringparam(struct net_device *netdev, - struct ethtool_ringparam *ring) + struct ethtool_ringparam *ring, + struct kernel_ethtool_ringparam *kernel_ring, + struct netlink_ext_ack *extack) { struct ixgbe_adapter *adapter = netdev_priv(netdev); struct ixgbe_ring *tx_ring = adapter->tx_ring[0]; @@ -1132,7 +1134,9 @@ static void ixgbe_get_ringparam(struct net_device *netdev, }
static int ixgbe_set_ringparam(struct net_device *netdev, - struct ethtool_ringparam *ring) + struct ethtool_ringparam *ring, + struct kernel_ethtool_ringparam *kernel_ring, + struct netlink_ext_ack *extack) { struct ixgbe_adapter *adapter = netdev_priv(netdev); struct ixgbe_ring *temp_ring; diff --git a/drivers/net/ethernet/intel/ixgbevf/ethtool.c b/drivers/net/ethernet/intel/ixgbevf/ethtool.c index 8380f905e708..3b41f83c8dff 100644 --- a/drivers/net/ethernet/intel/ixgbevf/ethtool.c +++ b/drivers/net/ethernet/intel/ixgbevf/ethtool.c @@ -225,7 +225,9 @@ static void ixgbevf_get_drvinfo(struct net_device *netdev, }
static void ixgbevf_get_ringparam(struct net_device *netdev, - struct ethtool_ringparam *ring) + struct ethtool_ringparam *ring, + struct kernel_ethtool_ringparam *kernel_ring, + struct netlink_ext_ack *extack) { struct ixgbevf_adapter *adapter = netdev_priv(netdev);
@@ -236,7 +238,9 @@ static void ixgbevf_get_ringparam(struct net_device *netdev, }
static int ixgbevf_set_ringparam(struct net_device *netdev, - struct ethtool_ringparam *ring) + struct ethtool_ringparam *ring, + struct kernel_ethtool_ringparam *kernel_ring, + struct netlink_ext_ack *extack) { struct ixgbevf_adapter *adapter = netdev_priv(netdev); struct ixgbevf_ring *tx_ring = NULL, *rx_ring = NULL; diff --git a/drivers/net/ethernet/marvell/mv643xx_eth.c b/drivers/net/ethernet/marvell/mv643xx_eth.c index 1f65a6e5685c..c25c2eb3aa7e 100644 --- a/drivers/net/ethernet/marvell/mv643xx_eth.c +++ b/drivers/net/ethernet/marvell/mv643xx_eth.c @@ -1636,7 +1636,9 @@ static int mv643xx_eth_set_coalesce(struct net_device *dev, }
static void -mv643xx_eth_get_ringparam(struct net_device *dev, struct ethtool_ringparam *er) +mv643xx_eth_get_ringparam(struct net_device *dev, struct ethtool_ringparam *er, + struct kernel_ethtool_ringparam *kernel_er, + struct netlink_ext_ack *extack) { struct mv643xx_eth_private *mp = netdev_priv(dev);
@@ -1648,7 +1650,9 @@ mv643xx_eth_get_ringparam(struct net_device *dev, struct ethtool_ringparam *er) }
static int -mv643xx_eth_set_ringparam(struct net_device *dev, struct ethtool_ringparam *er) +mv643xx_eth_set_ringparam(struct net_device *dev, struct ethtool_ringparam *er, + struct kernel_ethtool_ringparam *kernel_er, + struct netlink_ext_ack *extack) { struct mv643xx_eth_private *mp = netdev_priv(dev);
diff --git a/drivers/net/ethernet/marvell/mvneta.c b/drivers/net/ethernet/marvell/mvneta.c index 7fc44c43f4ee..10a53303de85 100644 --- a/drivers/net/ethernet/marvell/mvneta.c +++ b/drivers/net/ethernet/marvell/mvneta.c @@ -4534,8 +4534,11 @@ static void mvneta_ethtool_get_drvinfo(struct net_device *dev, }
-static void mvneta_ethtool_get_ringparam(struct net_device *netdev, - struct ethtool_ringparam *ring) +static void +mvneta_ethtool_get_ringparam(struct net_device *netdev, + struct ethtool_ringparam *ring, + struct kernel_ethtool_ringparam *kernel_ring, + struct netlink_ext_ack *extack) { struct mvneta_port *pp = netdev_priv(netdev);
@@ -4545,8 +4548,11 @@ static void mvneta_ethtool_get_ringparam(struct net_device *netdev, ring->tx_pending = pp->tx_ring_size; }
-static int mvneta_ethtool_set_ringparam(struct net_device *dev, - struct ethtool_ringparam *ring) +static int +mvneta_ethtool_set_ringparam(struct net_device *dev, + struct ethtool_ringparam *ring, + struct kernel_ethtool_ringparam *kernel_ring, + struct netlink_ext_ack *extack) { struct mvneta_port *pp = netdev_priv(dev);
diff --git a/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c b/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c index ac75a215772d..27f7792893c4 100644 --- a/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c +++ b/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c @@ -5070,8 +5070,11 @@ static void mvpp2_ethtool_get_drvinfo(struct net_device *dev, sizeof(drvinfo->bus_info)); }
-static void mvpp2_ethtool_get_ringparam(struct net_device *dev, - struct ethtool_ringparam *ring) +static void +mvpp2_ethtool_get_ringparam(struct net_device *dev, + struct ethtool_ringparam *ring, + struct kernel_ethtool_ringparam *kernel_ring, + struct netlink_ext_ack *extack) { struct mvpp2_port *port = netdev_priv(dev);
@@ -5081,8 +5084,11 @@ static void mvpp2_ethtool_get_ringparam(struct net_device *dev, ring->tx_pending = port->tx_ring_size; }
-static int mvpp2_ethtool_set_ringparam(struct net_device *dev, - struct ethtool_ringparam *ring) +static int +mvpp2_ethtool_set_ringparam(struct net_device *dev, + struct ethtool_ringparam *ring, + struct kernel_ethtool_ringparam *kernel_ring, + struct netlink_ext_ack *extack) { struct mvpp2_port *port = netdev_priv(dev); u16 prev_rx_ring_size = port->rx_ring_size; diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_ethtool.c b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_ethtool.c index dc8d1b01a7dc..457702db2693 100644 --- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_ethtool.c +++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_ethtool.c @@ -291,7 +291,9 @@ static int otx2_set_pauseparam(struct net_device *netdev, }
static void otx2_get_ringparam(struct net_device *netdev, - struct ethtool_ringparam *ring) + struct ethtool_ringparam *ring, + struct kernel_ethtool_ringparam *kernel_ring, + struct netlink_ext_ack *extack) { struct otx2_nic *pfvf = netdev_priv(netdev); struct otx2_qset *qs = &pfvf->qset; @@ -303,7 +305,9 @@ static void otx2_get_ringparam(struct net_device *netdev, }
static int otx2_set_ringparam(struct net_device *netdev, - struct ethtool_ringparam *ring) + struct ethtool_ringparam *ring, + struct kernel_ethtool_ringparam *kernel_ring, + struct netlink_ext_ack *extack) { struct otx2_nic *pfvf = netdev_priv(netdev); bool if_up = netif_running(netdev); diff --git a/drivers/net/ethernet/marvell/skge.c b/drivers/net/ethernet/marvell/skge.c index 715b2483e7fc..e65aaf4e768e 100644 --- a/drivers/net/ethernet/marvell/skge.c +++ b/drivers/net/ethernet/marvell/skge.c @@ -492,7 +492,9 @@ static void skge_get_strings(struct net_device *dev, u32 stringset, u8 *data) }
static void skge_get_ring_param(struct net_device *dev, - struct ethtool_ringparam *p) + struct ethtool_ringparam *p, + struct kernel_ethtool_ringparam *kernel_p, + struct netlink_ext_ack *extack) { struct skge_port *skge = netdev_priv(dev);
@@ -504,7 +506,9 @@ static void skge_get_ring_param(struct net_device *dev, }
static int skge_set_ring_param(struct net_device *dev, - struct ethtool_ringparam *p) + struct ethtool_ringparam *p, + struct kernel_ethtool_ringparam *kernel_p, + struct netlink_ext_ack *extack) { struct skge_port *skge = netdev_priv(dev); int err = 0; diff --git a/drivers/net/ethernet/marvell/sky2.c b/drivers/net/ethernet/marvell/sky2.c index a035773a82e6..26ca9655f248 100644 --- a/drivers/net/ethernet/marvell/sky2.c +++ b/drivers/net/ethernet/marvell/sky2.c @@ -4147,7 +4147,9 @@ static unsigned long roundup_ring_size(unsigned long pending) }
static void sky2_get_ringparam(struct net_device *dev, - struct ethtool_ringparam *ering) + struct ethtool_ringparam *ering, + struct kernel_ethtool_ringparam *kernel_ering, + struct netlink_ext_ack *extack) { struct sky2_port *sky2 = netdev_priv(dev);
@@ -4159,7 +4161,9 @@ static void sky2_get_ringparam(struct net_device *dev, }
static int sky2_set_ringparam(struct net_device *dev, - struct ethtool_ringparam *ering) + struct ethtool_ringparam *ering, + struct kernel_ethtool_ringparam *kernel_ering, + struct netlink_ext_ack *extack) { struct sky2_port *sky2 = netdev_priv(dev);
diff --git a/drivers/net/ethernet/mellanox/mlx4/en_ethtool.c b/drivers/net/ethernet/mellanox/mlx4/en_ethtool.c index ef518b1040f7..9913819d62d6 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_ethtool.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_ethtool.c @@ -1138,7 +1138,9 @@ static void mlx4_en_get_pauseparam(struct net_device *dev, }
static int mlx4_en_set_ringparam(struct net_device *dev, - struct ethtool_ringparam *param) + struct ethtool_ringparam *param, + struct kernel_ethtool_ringparam *kernel_param, + struct netlink_ext_ack *extack) { struct mlx4_en_priv *priv = netdev_priv(dev); struct mlx4_en_dev *mdev = priv->mdev; @@ -1205,7 +1207,9 @@ static int mlx4_en_set_ringparam(struct net_device *dev, }
static void mlx4_en_get_ringparam(struct net_device *dev, - struct ethtool_ringparam *param) + struct ethtool_ringparam *param, + struct kernel_ethtool_ringparam *kernel_param, + struct netlink_ext_ack *extack) { struct mlx4_en_priv *priv = netdev_priv(dev);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c index c798e634885f..584751da20b1 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c @@ -316,7 +316,9 @@ void mlx5e_ethtool_get_ringparam(struct mlx5e_priv *priv, }
static void mlx5e_get_ringparam(struct net_device *dev, - struct ethtool_ringparam *param) + struct ethtool_ringparam *param, + struct kernel_ethtool_ringparam *kernel_param, + struct netlink_ext_ack *extack) { struct mlx5e_priv *priv = netdev_priv(dev);
@@ -383,7 +385,9 @@ int mlx5e_ethtool_set_ringparam(struct mlx5e_priv *priv, }
static int mlx5e_set_ringparam(struct net_device *dev, - struct ethtool_ringparam *param) + struct ethtool_ringparam *param, + struct kernel_ethtool_ringparam *kernel_param, + struct netlink_ext_ack *extack) { struct mlx5e_priv *priv = netdev_priv(dev);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c index 5bbf8e686708..5778ce5ff4e8 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c @@ -225,16 +225,22 @@ static int mlx5e_rep_get_sset_count(struct net_device *dev, int sset) } }
-static void mlx5e_rep_get_ringparam(struct net_device *dev, - struct ethtool_ringparam *param) +static void +mlx5e_rep_get_ringparam(struct net_device *dev, + struct ethtool_ringparam *param, + struct kernel_ethtool_ringparam *kernel_param, + struct netlink_ext_ack *extack) { struct mlx5e_priv *priv = netdev_priv(dev);
mlx5e_ethtool_get_ringparam(priv, param); }
-static int mlx5e_rep_set_ringparam(struct net_device *dev, - struct ethtool_ringparam *param) +static int +mlx5e_rep_set_ringparam(struct net_device *dev, + struct ethtool_ringparam *param, + struct kernel_ethtool_ringparam *kernel_param, + struct netlink_ext_ack *extack) { struct mlx5e_priv *priv = netdev_priv(dev);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ethtool.c b/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ethtool.c index 04c9a138029b..5c7eee2f6697 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ethtool.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ethtool.c @@ -67,7 +67,9 @@ static void mlx5i_get_ethtool_stats(struct net_device *dev, }
static int mlx5i_set_ringparam(struct net_device *dev, - struct ethtool_ringparam *param) + struct ethtool_ringparam *param, + struct kernel_ethtool_ringparam *kernel_param, + struct netlink_ext_ack *extack) { struct mlx5e_priv *priv = mlx5i_epriv(dev);
@@ -75,7 +77,9 @@ static int mlx5i_set_ringparam(struct net_device *dev, }
static void mlx5i_get_ringparam(struct net_device *dev, - struct ethtool_ringparam *param) + struct ethtool_ringparam *param, + struct kernel_ethtool_ringparam *kernel_param, + struct netlink_ext_ack *extack) { struct mlx5e_priv *priv = mlx5i_epriv(dev);
diff --git a/drivers/net/ethernet/micrel/ksz884x.c b/drivers/net/ethernet/micrel/ksz884x.c index 9ed264ed7070..8a45c4577d38 100644 --- a/drivers/net/ethernet/micrel/ksz884x.c +++ b/drivers/net/ethernet/micrel/ksz884x.c @@ -6392,11 +6392,15 @@ static int netdev_set_pauseparam(struct net_device *dev, * netdev_get_ringparam - get tx/rx ring parameters * @dev: Network device. * @ring: Ethtool RING settings data structure. + * @kernel_ring: Ethtool external RING settings data structure. + * @extack: Netlink handle. * * This procedure returns the TX/RX ring settings. */ static void netdev_get_ringparam(struct net_device *dev, - struct ethtool_ringparam *ring) + struct ethtool_ringparam *ring, + struct kernel_ethtool_ringparam *kernel_ring, + struct netlink_ext_ack *extack) { struct dev_priv *priv = netdev_priv(dev); struct dev_info *hw_priv = priv->adapter; diff --git a/drivers/net/ethernet/myricom/myri10ge/myri10ge.c b/drivers/net/ethernet/myricom/myri10ge/myri10ge.c index 6b69512e742e..b5c4a64bc025 100644 --- a/drivers/net/ethernet/myricom/myri10ge/myri10ge.c +++ b/drivers/net/ethernet/myricom/myri10ge/myri10ge.c @@ -1702,7 +1702,9 @@ myri10ge_set_pauseparam(struct net_device *netdev,
static void myri10ge_get_ringparam(struct net_device *netdev, - struct ethtool_ringparam *ring) + struct ethtool_ringparam *ring, + struct kernel_ethtool_ringparam *kernel_ring, + struct netlink_ext_ack *extack) { struct myri10ge_priv *mgp = netdev_priv(netdev);
diff --git a/drivers/net/ethernet/neterion/s2io.c b/drivers/net/ethernet/neterion/s2io.c index 3cae8449fadb..5b0badc88a97 100644 --- a/drivers/net/ethernet/neterion/s2io.c +++ b/drivers/net/ethernet/neterion/s2io.c @@ -5450,8 +5450,11 @@ static int s2io_ethtool_set_led(struct net_device *dev, return 0; }
-static void s2io_ethtool_gringparam(struct net_device *dev, - struct ethtool_ringparam *ering) +static void +s2io_ethtool_gringparam(struct net_device *dev, + struct ethtool_ringparam *ering, + struct kernel_ethtool_ringparam *kernel_ering, + struct netlink_ext_ack *extack) { struct s2io_nic *sp = netdev_priv(dev); int i, tx_desc_count = 0, rx_desc_count = 0; diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_ethtool.c b/drivers/net/ethernet/netronome/nfp/nfp_net_ethtool.c index c0e39b81eb74..880e73ed6966 100644 --- a/drivers/net/ethernet/netronome/nfp/nfp_net_ethtool.c +++ b/drivers/net/ethernet/netronome/nfp/nfp_net_ethtool.c @@ -380,7 +380,9 @@ nfp_net_set_link_ksettings(struct net_device *netdev, }
static void nfp_net_get_ringparam(struct net_device *netdev, - struct ethtool_ringparam *ring) + struct ethtool_ringparam *ring, + struct kernel_ethtool_ringparam *kernel_ring, + struct netlink_ext_ack *extack) { struct nfp_net *nn = netdev_priv(netdev);
@@ -405,7 +407,9 @@ static int nfp_net_set_ring_size(struct nfp_net *nn, u32 rxd_cnt, u32 txd_cnt) }
static int nfp_net_set_ringparam(struct net_device *netdev, - struct ethtool_ringparam *ring) + struct ethtool_ringparam *ring, + struct kernel_ethtool_ringparam *kernel_ring, + struct netlink_ext_ack *extack) { struct nfp_net *nn = netdev_priv(netdev); u32 rxd_cnt, txd_cnt; diff --git a/drivers/net/ethernet/netswift/txgbe/txgbe_ethtool.c b/drivers/net/ethernet/netswift/txgbe/txgbe_ethtool.c index ca7265259a36..399bec32aa3f 100644 --- a/drivers/net/ethernet/netswift/txgbe/txgbe_ethtool.c +++ b/drivers/net/ethernet/netswift/txgbe/txgbe_ethtool.c @@ -1050,7 +1050,9 @@ static void txgbe_get_drvinfo(struct net_device *netdev, }
static void txgbe_get_ringparam(struct net_device *netdev, - struct ethtool_ringparam *ring) + struct ethtool_ringparam *ring, + struct kernel_ethtool_ringparam *kernel_ring, + struct netlink_ext_ack *extack) { struct txgbe_adapter *adapter = netdev_priv(netdev);
@@ -1065,7 +1067,9 @@ static void txgbe_get_ringparam(struct net_device *netdev, }
static int txgbe_set_ringparam(struct net_device *netdev, - struct ethtool_ringparam *ring) + struct ethtool_ringparam *ring, + struct kernel_ethtool_ringparam *kernel_ring, + struct netlink_ext_ack *extack) { struct txgbe_adapter *adapter = netdev_priv(netdev); struct txgbe_ring *temp_ring; diff --git a/drivers/net/ethernet/nvidia/forcedeth.c b/drivers/net/ethernet/nvidia/forcedeth.c index 2fc10a36afa4..7bdb146b3520 100644 --- a/drivers/net/ethernet/nvidia/forcedeth.c +++ b/drivers/net/ethernet/nvidia/forcedeth.c @@ -4656,7 +4656,10 @@ static int nv_nway_reset(struct net_device *dev) return ret; }
-static void nv_get_ringparam(struct net_device *dev, struct ethtool_ringparam* ring) +static void nv_get_ringparam(struct net_device *dev, + struct ethtool_ringparam *ring, + struct kernel_ethtool_ringparam *kernel_ring, + struct netlink_ext_ack *extack) { struct fe_priv *np = netdev_priv(dev);
@@ -4667,7 +4670,10 @@ static void nv_get_ringparam(struct net_device *dev, struct ethtool_ringparam* r ring->tx_pending = np->tx_ring_size; }
-static int nv_set_ringparam(struct net_device *dev, struct ethtool_ringparam* ring) +static int nv_set_ringparam(struct net_device *dev, + struct ethtool_ringparam *ring, + struct kernel_ethtool_ringparam *kernel_ring, + struct netlink_ext_ack *extack) { struct fe_priv *np = netdev_priv(dev); u8 __iomem *base = get_hwbase(dev); diff --git a/drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe_ethtool.c b/drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe_ethtool.c index a58f14aca10c..0e3bcf82686c 100644 --- a/drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe_ethtool.c +++ b/drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe_ethtool.c @@ -268,9 +268,13 @@ static int pch_gbe_nway_reset(struct net_device *netdev) * pch_gbe_get_ringparam - Report ring sizes * @netdev: Network interface device structure * @ring: Ring param structure + * @kernel_ring: Ring external param structure + * @extack: netlink handle */ static void pch_gbe_get_ringparam(struct net_device *netdev, - struct ethtool_ringparam *ring) + struct ethtool_ringparam *ring, + struct kernel_ethtool_ringparam *kernel_ring, + struct netlink_ext_ack *extack) { struct pch_gbe_adapter *adapter = netdev_priv(netdev); struct pch_gbe_tx_ring *txdr = adapter->tx_ring; @@ -286,12 +290,16 @@ static void pch_gbe_get_ringparam(struct net_device *netdev, * pch_gbe_set_ringparam - Set ring sizes * @netdev: Network interface device structure * @ring: Ring param structure + * @kernel_ring: Ring external param structure + * @extack: netlink handle * Returns * 0: Successful. * Negative value: Failed. */ static int pch_gbe_set_ringparam(struct net_device *netdev, - struct ethtool_ringparam *ring) + struct ethtool_ringparam *ring, + struct kernel_ethtool_ringparam *kernel_ring, + struct netlink_ext_ack *extack) { struct pch_gbe_adapter *adapter = netdev_priv(netdev); struct pch_gbe_tx_ring *txdr, *tx_old; diff --git a/drivers/net/ethernet/pasemi/pasemi_mac_ethtool.c b/drivers/net/ethernet/pasemi/pasemi_mac_ethtool.c index e1a304886a3c..4c7e0c991105 100644 --- a/drivers/net/ethernet/pasemi/pasemi_mac_ethtool.c +++ b/drivers/net/ethernet/pasemi/pasemi_mac_ethtool.c @@ -69,7 +69,9 @@ pasemi_mac_ethtool_set_msglevel(struct net_device *netdev,
static void pasemi_mac_ethtool_get_ringparam(struct net_device *netdev, - struct ethtool_ringparam *ering) + struct ethtool_ringparam *ering, + struct kernel_ethtool_ringparam *kernel_ering, + struct netlink_ext_ack *extack) { struct pasemi_mac *mac = netdev_priv(netdev);
diff --git a/drivers/net/ethernet/pensando/ionic/ionic_ethtool.c b/drivers/net/ethernet/pensando/ionic/ionic_ethtool.c index f968cd26c56e..45ea730cae88 100644 --- a/drivers/net/ethernet/pensando/ionic/ionic_ethtool.c +++ b/drivers/net/ethernet/pensando/ionic/ionic_ethtool.c @@ -511,7 +511,9 @@ static int ionic_set_coalesce(struct net_device *netdev, }
static void ionic_get_ringparam(struct net_device *netdev, - struct ethtool_ringparam *ring) + struct ethtool_ringparam *ring, + struct kernel_ethtool_ringparam *kernel_ring, + struct netlink_ext_ack *extack) { struct ionic_lif *lif = netdev_priv(netdev);
@@ -522,7 +524,9 @@ static void ionic_get_ringparam(struct net_device *netdev, }
static int ionic_set_ringparam(struct net_device *netdev, - struct ethtool_ringparam *ring) + struct ethtool_ringparam *ring, + struct kernel_ethtool_ringparam *kernel_ring, + struct netlink_ext_ack *extack) { struct ionic_lif *lif = netdev_priv(netdev); struct ionic_queue_params qparam; diff --git a/drivers/net/ethernet/qlogic/netxen/netxen_nic_ethtool.c b/drivers/net/ethernet/qlogic/netxen/netxen_nic_ethtool.c index a075643f5826..3c4a84ea6321 100644 --- a/drivers/net/ethernet/qlogic/netxen/netxen_nic_ethtool.c +++ b/drivers/net/ethernet/qlogic/netxen/netxen_nic_ethtool.c @@ -392,7 +392,9 @@ netxen_nic_get_eeprom(struct net_device *dev, struct ethtool_eeprom *eeprom,
static void netxen_nic_get_ringparam(struct net_device *dev, - struct ethtool_ringparam *ring) + struct ethtool_ringparam *ring, + struct kernel_ethtool_ringparam *kernel_ring, + struct netlink_ext_ack *extack) { struct netxen_adapter *adapter = netdev_priv(dev);
@@ -430,7 +432,9 @@ netxen_validate_ringparam(u32 val, u32 min, u32 max, char *r_name)
static int netxen_nic_set_ringparam(struct net_device *dev, - struct ethtool_ringparam *ring) + struct ethtool_ringparam *ring, + struct kernel_ethtool_ringparam *kernel_ring, + struct netlink_ext_ack *extack) { struct netxen_adapter *adapter = netdev_priv(dev); u16 max_rcv_desc = MAX_RCV_DESCRIPTORS_10G; diff --git a/drivers/net/ethernet/qlogic/qede/qede_ethtool.c b/drivers/net/ethernet/qlogic/qede/qede_ethtool.c index 4360c6602356..43380e40357a 100644 --- a/drivers/net/ethernet/qlogic/qede/qede_ethtool.c +++ b/drivers/net/ethernet/qlogic/qede/qede_ethtool.c @@ -885,7 +885,9 @@ static int qede_set_coalesce(struct net_device *dev, }
static void qede_get_ringparam(struct net_device *dev, - struct ethtool_ringparam *ering) + struct ethtool_ringparam *ering, + struct kernel_ethtool_ringparam *kernel_ering, + struct netlink_ext_ack *extack) { struct qede_dev *edev = netdev_priv(dev);
@@ -896,7 +898,9 @@ static void qede_get_ringparam(struct net_device *dev, }
static int qede_set_ringparam(struct net_device *dev, - struct ethtool_ringparam *ering) + struct ethtool_ringparam *ering, + struct kernel_ethtool_ringparam *kernel_ering, + struct netlink_ext_ack *extack) { struct qede_dev *edev = netdev_priv(dev);
diff --git a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_ethtool.c b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_ethtool.c index 7ac4b5f33378..9a803423b05f 100644 --- a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_ethtool.c +++ b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_ethtool.c @@ -632,7 +632,9 @@ qlcnic_get_eeprom(struct net_device *dev, struct ethtool_eeprom *eeprom,
static void qlcnic_get_ringparam(struct net_device *dev, - struct ethtool_ringparam *ring) + struct ethtool_ringparam *ring, + struct kernel_ethtool_ringparam *kernel_ring, + struct netlink_ext_ack *extack) { struct qlcnic_adapter *adapter = netdev_priv(dev);
@@ -663,7 +665,9 @@ qlcnic_validate_ringparam(u32 val, u32 min, u32 max, char *r_name)
static int qlcnic_set_ringparam(struct net_device *dev, - struct ethtool_ringparam *ring) + struct ethtool_ringparam *ring, + struct kernel_ethtool_ringparam *kernel_ring, + struct netlink_ext_ack *extack) { struct qlcnic_adapter *adapter = netdev_priv(dev); u16 num_rxd, num_jumbo_rxd, num_txd; diff --git a/drivers/net/ethernet/qualcomm/emac/emac-ethtool.c b/drivers/net/ethernet/qualcomm/emac/emac-ethtool.c index 79e50079ed03..54ed7b79c3e2 100644 --- a/drivers/net/ethernet/qualcomm/emac/emac-ethtool.c +++ b/drivers/net/ethernet/qualcomm/emac/emac-ethtool.c @@ -133,7 +133,9 @@ static int emac_nway_reset(struct net_device *netdev) }
static void emac_get_ringparam(struct net_device *netdev, - struct ethtool_ringparam *ring) + struct ethtool_ringparam *ring, + struct kernel_ethtool_ringparam *kernel_ring, + struct netlink_ext_ack *extack) { struct emac_adapter *adpt = netdev_priv(netdev);
@@ -144,7 +146,9 @@ static void emac_get_ringparam(struct net_device *netdev, }
static int emac_set_ringparam(struct net_device *netdev, - struct ethtool_ringparam *ring) + struct ethtool_ringparam *ring, + struct kernel_ethtool_ringparam *kernel_ring, + struct netlink_ext_ack *extack) { struct emac_adapter *adpt = netdev_priv(netdev);
diff --git a/drivers/net/ethernet/qualcomm/qca_debug.c b/drivers/net/ethernet/qualcomm/qca_debug.c index 702aa217a27a..da87019a0160 100644 --- a/drivers/net/ethernet/qualcomm/qca_debug.c +++ b/drivers/net/ethernet/qualcomm/qca_debug.c @@ -245,7 +245,9 @@ qcaspi_get_regs(struct net_device *dev, struct ethtool_regs *regs, void *p) }
static void -qcaspi_get_ringparam(struct net_device *dev, struct ethtool_ringparam *ring) +qcaspi_get_ringparam(struct net_device *dev, struct ethtool_ringparam *ring, + struct kernel_ethtool_ringparam *kernel_ring, + struct netlink_ext_ack *extack) { struct qcaspi *qca = netdev_priv(dev);
@@ -256,7 +258,9 @@ qcaspi_get_ringparam(struct net_device *dev, struct ethtool_ringparam *ring) }
static int -qcaspi_set_ringparam(struct net_device *dev, struct ethtool_ringparam *ring) +qcaspi_set_ringparam(struct net_device *dev, struct ethtool_ringparam *ring, + struct kernel_ethtool_ringparam *kernel_ring, + struct netlink_ext_ack *extack) { const struct net_device_ops *ops = dev->netdev_ops; struct qcaspi *qca = netdev_priv(dev); diff --git a/drivers/net/ethernet/ramaxel/spnic/spnic_ethtool.c b/drivers/net/ethernet/ramaxel/spnic/spnic_ethtool.c index dc49395c47d5..e3a7e81a601b 100644 --- a/drivers/net/ethernet/ramaxel/spnic/spnic_ethtool.c +++ b/drivers/net/ethernet/ramaxel/spnic/spnic_ethtool.c @@ -94,7 +94,9 @@ int spnic_nway_reset(struct net_device *netdev) return 0; }
-static void spnic_get_ringparam(struct net_device *netdev, struct ethtool_ringparam *ring) +static void spnic_get_ringparam(struct net_device *netdev, struct ethtool_ringparam *ring, + struct kernel_ethtool_ringparam *kernel_ering, + struct netlink_ext_ack *extack) { struct spnic_nic_dev *nic_dev = netdev_priv(netdev);
@@ -141,7 +143,9 @@ static int check_ringparam_valid(struct net_device *netdev, struct ethtool_ringp return 0; }
-static int spnic_set_ringparam(struct net_device *netdev, struct ethtool_ringparam *ring) +static int spnic_set_ringparam(struct net_device *netdev, struct ethtool_ringparam *ring, + struct kernel_ethtool_ringparam *kernel_ering, + struct netlink_ext_ack *extack) { struct spnic_nic_dev *nic_dev = netdev_priv(netdev); struct spnic_dyna_txrxq_params q_params = {0}; diff --git a/drivers/net/ethernet/realtek/8139cp.c b/drivers/net/ethernet/realtek/8139cp.c index 4e44313b7651..3d41de743840 100644 --- a/drivers/net/ethernet/realtek/8139cp.c +++ b/drivers/net/ethernet/realtek/8139cp.c @@ -1388,7 +1388,9 @@ static void cp_get_drvinfo (struct net_device *dev, struct ethtool_drvinfo *info }
static void cp_get_ringparam(struct net_device *dev, - struct ethtool_ringparam *ring) + struct ethtool_ringparam *ring, + struct kernel_ethtool_ringparam *kernel_ring, + struct netlink_ext_ack *extack) { ring->rx_max_pending = CP_RX_RING_SIZE; ring->tx_max_pending = CP_TX_RING_SIZE; diff --git a/drivers/net/ethernet/renesas/ravb_main.c b/drivers/net/ethernet/renesas/ravb_main.c index f96eed67e1a2..59dcdedb7d2a 100644 --- a/drivers/net/ethernet/renesas/ravb_main.c +++ b/drivers/net/ethernet/renesas/ravb_main.c @@ -1182,7 +1182,9 @@ static void ravb_get_strings(struct net_device *ndev, u32 stringset, u8 *data) }
static void ravb_get_ringparam(struct net_device *ndev, - struct ethtool_ringparam *ring) + struct ethtool_ringparam *ring, + struct kernel_ethtool_ringparam *kernel_ring, + struct netlink_ext_ack *extack) { struct ravb_private *priv = netdev_priv(ndev);
@@ -1193,7 +1195,9 @@ static void ravb_get_ringparam(struct net_device *ndev, }
static int ravb_set_ringparam(struct net_device *ndev, - struct ethtool_ringparam *ring) + struct ethtool_ringparam *ring, + struct kernel_ethtool_ringparam *kernel_ring, + struct netlink_ext_ack *extack) { struct ravb_private *priv = netdev_priv(ndev); int error; diff --git a/drivers/net/ethernet/renesas/sh_eth.c b/drivers/net/ethernet/renesas/sh_eth.c index 8927d5997745..19802e05d03d 100644 --- a/drivers/net/ethernet/renesas/sh_eth.c +++ b/drivers/net/ethernet/renesas/sh_eth.c @@ -2294,7 +2294,9 @@ static void sh_eth_get_strings(struct net_device *ndev, u32 stringset, u8 *data) }
static void sh_eth_get_ringparam(struct net_device *ndev, - struct ethtool_ringparam *ring) + struct ethtool_ringparam *ring, + struct kernel_ethtool_ringparam *kernel_ring, + struct netlink_ext_ack *extack) { struct sh_eth_private *mdp = netdev_priv(ndev);
@@ -2305,7 +2307,9 @@ static void sh_eth_get_ringparam(struct net_device *ndev, }
static int sh_eth_set_ringparam(struct net_device *ndev, - struct ethtool_ringparam *ring) + struct ethtool_ringparam *ring, + struct kernel_ethtool_ringparam *kernel_ring, + struct netlink_ext_ack *extack) { struct sh_eth_private *mdp = netdev_priv(ndev); int ret; diff --git a/drivers/net/ethernet/sfc/ef100_ethtool.c b/drivers/net/ethernet/sfc/ef100_ethtool.c index 835c838b7dfa..5dba4125d953 100644 --- a/drivers/net/ethernet/sfc/ef100_ethtool.c +++ b/drivers/net/ethernet/sfc/ef100_ethtool.c @@ -20,8 +20,11 @@ /* This is the maximum number of descriptor rings supported by the QDMA */ #define EFX_EF100_MAX_DMAQ_SIZE 16384UL
-static void ef100_ethtool_get_ringparam(struct net_device *net_dev, - struct ethtool_ringparam *ring) +static void +ef100_ethtool_get_ringparam(struct net_device *net_dev, + struct ethtool_ringparam *ring, + struct kernel_ethtool_ringparam *kernel_ring, + struct netlink_ext_ack *extack) { struct efx_nic *efx = netdev_priv(net_dev);
diff --git a/drivers/net/ethernet/sfc/ethtool.c b/drivers/net/ethernet/sfc/ethtool.c index 286056a063aa..1a19172ea70f 100644 --- a/drivers/net/ethernet/sfc/ethtool.c +++ b/drivers/net/ethernet/sfc/ethtool.c @@ -157,8 +157,11 @@ static int efx_ethtool_set_coalesce(struct net_device *net_dev, return 0; }
-static void efx_ethtool_get_ringparam(struct net_device *net_dev, - struct ethtool_ringparam *ring) +static void +efx_ethtool_get_ringparam(struct net_device *net_dev, + struct ethtool_ringparam *ring, + struct kernel_ethtool_ringparam *kernel_ring, + struct netlink_ext_ack *extack) { struct efx_nic *efx = netdev_priv(net_dev);
@@ -168,8 +171,11 @@ static void efx_ethtool_get_ringparam(struct net_device *net_dev, ring->tx_pending = efx->txq_entries; }
-static int efx_ethtool_set_ringparam(struct net_device *net_dev, - struct ethtool_ringparam *ring) +static int +efx_ethtool_set_ringparam(struct net_device *net_dev, + struct ethtool_ringparam *ring, + struct kernel_ethtool_ringparam *kernel_ring, + struct netlink_ext_ack *extack) { struct efx_nic *efx = netdev_priv(net_dev); u32 txq_entries; diff --git a/drivers/net/ethernet/sfc/falcon/ethtool.c b/drivers/net/ethernet/sfc/falcon/ethtool.c index 137e8a7aeaa1..907254b36663 100644 --- a/drivers/net/ethernet/sfc/falcon/ethtool.c +++ b/drivers/net/ethernet/sfc/falcon/ethtool.c @@ -637,8 +637,11 @@ static int ef4_ethtool_set_coalesce(struct net_device *net_dev, return 0; }
-static void ef4_ethtool_get_ringparam(struct net_device *net_dev, - struct ethtool_ringparam *ring) +static void +ef4_ethtool_get_ringparam(struct net_device *net_dev, + struct ethtool_ringparam *ring, + struct kernel_ethtool_ringparam *kernel_ring, + struct netlink_ext_ack *extack) { struct ef4_nic *efx = netdev_priv(net_dev);
@@ -648,8 +651,11 @@ static void ef4_ethtool_get_ringparam(struct net_device *net_dev, ring->tx_pending = efx->txq_entries; }
-static int ef4_ethtool_set_ringparam(struct net_device *net_dev, - struct ethtool_ringparam *ring) +static int +ef4_ethtool_set_ringparam(struct net_device *net_dev, + struct ethtool_ringparam *ring, + struct kernel_ethtool_ringparam *kernel_ring, + struct netlink_ext_ack *extack) { struct ef4_nic *efx = netdev_priv(net_dev); u32 txq_entries; diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c index 7b5f29617795..2d11fcaad76f 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c @@ -441,7 +441,9 @@ static int stmmac_nway_reset(struct net_device *dev) }
static void stmmac_get_ringparam(struct net_device *netdev, - struct ethtool_ringparam *ring) + struct ethtool_ringparam *ring, + struct kernel_ethtool_ringparam *kernel_ring, + struct netlink_ext_ack *extack) { struct stmmac_priv *priv = netdev_priv(netdev);
@@ -452,7 +454,9 @@ static void stmmac_get_ringparam(struct net_device *netdev, }
static int stmmac_set_ringparam(struct net_device *netdev, - struct ethtool_ringparam *ring) + struct ethtool_ringparam *ring, + struct kernel_ethtool_ringparam *kernel_ring, + struct netlink_ext_ack *extack) { if (ring->rx_mini_pending || ring->rx_jumbo_pending || ring->rx_pending < DMA_MIN_RX_SIZE || diff --git a/drivers/net/ethernet/tehuti/tehuti.c b/drivers/net/ethernet/tehuti/tehuti.c index 7aebb2228dc8..247a7b6ccb41 100644 --- a/drivers/net/ethernet/tehuti/tehuti.c +++ b/drivers/net/ethernet/tehuti/tehuti.c @@ -2251,9 +2251,13 @@ static inline int bdx_tx_fifo_size_to_packets(int tx_size) * bdx_get_ringparam - report ring sizes * @netdev * @ring + * @kernel_ring + * @extack */ static void -bdx_get_ringparam(struct net_device *netdev, struct ethtool_ringparam *ring) +bdx_get_ringparam(struct net_device *netdev, struct ethtool_ringparam *ring, + struct kernel_ethtool_ringparam *kernel_ring, + struct netlink_ext_ack *extack) { struct bdx_priv *priv = netdev_priv(netdev);
@@ -2268,9 +2272,13 @@ bdx_get_ringparam(struct net_device *netdev, struct ethtool_ringparam *ring) * bdx_set_ringparam - set ring sizes * @netdev * @ring + * @kernel_ring + * @extack */ static int -bdx_set_ringparam(struct net_device *netdev, struct ethtool_ringparam *ring) +bdx_set_ringparam(struct net_device *netdev, struct ethtool_ringparam *ring, + struct kernel_ethtool_ringparam *kernel_ring, + struct netlink_ext_ack *extack) { struct bdx_priv *priv = netdev_priv(netdev); int rx_size = 0; diff --git a/drivers/net/ethernet/ti/am65-cpsw-ethtool.c b/drivers/net/ethernet/ti/am65-cpsw-ethtool.c index 6e4d4f9e32e0..ecf7ad5916db 100644 --- a/drivers/net/ethernet/ti/am65-cpsw-ethtool.c +++ b/drivers/net/ethernet/ti/am65-cpsw-ethtool.c @@ -453,8 +453,11 @@ static int am65_cpsw_set_channels(struct net_device *ndev, return am65_cpsw_nuss_update_tx_chns(common, chs->tx_count); }
-static void am65_cpsw_get_ringparam(struct net_device *ndev, - struct ethtool_ringparam *ering) +static void +am65_cpsw_get_ringparam(struct net_device *ndev, + struct ethtool_ringparam *ering, + struct kernel_ethtool_ringparam *kernel_ering, + struct netlink_ext_ack *extack) { struct am65_cpsw_common *common = am65_ndev_to_common(ndev);
diff --git a/drivers/net/ethernet/ti/cpmac.c b/drivers/net/ethernet/ti/cpmac.c index c20715107075..12b27e7c172b 100644 --- a/drivers/net/ethernet/ti/cpmac.c +++ b/drivers/net/ethernet/ti/cpmac.c @@ -817,7 +817,9 @@ static void cpmac_tx_timeout(struct net_device *dev, unsigned int txqueue) }
static void cpmac_get_ringparam(struct net_device *dev, - struct ethtool_ringparam *ring) + struct ethtool_ringparam *ring, + struct kernel_ethtool_ringparam *kernel_ring, + struct netlink_ext_ack *extack) { struct cpmac_priv *priv = netdev_priv(dev);
@@ -833,7 +835,9 @@ static void cpmac_get_ringparam(struct net_device *dev, }
static int cpmac_set_ringparam(struct net_device *dev, - struct ethtool_ringparam *ring) + struct ethtool_ringparam *ring, + struct kernel_ethtool_ringparam *kernel_ring, + struct netlink_ext_ack *extack) { struct cpmac_priv *priv = netdev_priv(dev);
diff --git a/drivers/net/ethernet/ti/cpsw_ethtool.c b/drivers/net/ethernet/ti/cpsw_ethtool.c index 158c8d3793f4..aa42141be3c0 100644 --- a/drivers/net/ethernet/ti/cpsw_ethtool.c +++ b/drivers/net/ethernet/ti/cpsw_ethtool.c @@ -658,7 +658,9 @@ int cpsw_set_channels_common(struct net_device *ndev, }
void cpsw_get_ringparam(struct net_device *ndev, - struct ethtool_ringparam *ering) + struct ethtool_ringparam *ering, + struct kernel_ethtool_ringparam *kernel_ering, + struct netlink_ext_ack *extack) { struct cpsw_priv *priv = netdev_priv(ndev); struct cpsw_common *cpsw = priv->cpsw; @@ -671,7 +673,9 @@ void cpsw_get_ringparam(struct net_device *ndev, }
int cpsw_set_ringparam(struct net_device *ndev, - struct ethtool_ringparam *ering) + struct ethtool_ringparam *ering, + struct kernel_ethtool_ringparam *kernel_ering, + struct netlink_ext_ack *extack) { struct cpsw_common *cpsw = ndev_to_cpsw(ndev); int descs_num, ret; diff --git a/drivers/net/ethernet/ti/cpsw_priv.h b/drivers/net/ethernet/ti/cpsw_priv.h index 2011a86e7571..fedc722bdaf9 100644 --- a/drivers/net/ethernet/ti/cpsw_priv.h +++ b/drivers/net/ethernet/ti/cpsw_priv.h @@ -491,9 +491,13 @@ int cpsw_get_eee(struct net_device *ndev, struct ethtool_eee *edata); int cpsw_set_eee(struct net_device *ndev, struct ethtool_eee *edata); int cpsw_nway_reset(struct net_device *ndev); void cpsw_get_ringparam(struct net_device *ndev, - struct ethtool_ringparam *ering); + struct ethtool_ringparam *ering, + struct kernel_ethtool_ringparam *kernel_ering, + struct netlink_ext_ack *extack); int cpsw_set_ringparam(struct net_device *ndev, - struct ethtool_ringparam *ering); + struct ethtool_ringparam *ering, + struct kernel_ethtool_ringparam *kernel_ering, + struct netlink_ext_ack *extack); int cpsw_set_channels_common(struct net_device *ndev, struct ethtool_channels *chs, cpdma_handler_fn rx_handler); diff --git a/drivers/net/ethernet/toshiba/spider_net_ethtool.c b/drivers/net/ethernet/toshiba/spider_net_ethtool.c index 54f655a68148..93110dba0bfa 100644 --- a/drivers/net/ethernet/toshiba/spider_net_ethtool.c +++ b/drivers/net/ethernet/toshiba/spider_net_ethtool.c @@ -110,7 +110,9 @@ spider_net_ethtool_nway_reset(struct net_device *netdev)
static void spider_net_ethtool_get_ringparam(struct net_device *netdev, - struct ethtool_ringparam *ering) + struct ethtool_ringparam *ering, + struct kernel_ethtool_ringparam *kernel_ering, + struct netlink_ext_ack *extack) { struct spider_net_card *card = netdev_priv(netdev);
diff --git a/drivers/net/ethernet/xilinx/ll_temac_main.c b/drivers/net/ethernet/xilinx/ll_temac_main.c index b53f3b58e8a7..52f184500dd6 100644 --- a/drivers/net/ethernet/xilinx/ll_temac_main.c +++ b/drivers/net/ethernet/xilinx/ll_temac_main.c @@ -1276,8 +1276,11 @@ static const struct attribute_group temac_attr_group = { * ethtool support */
-static void ll_temac_ethtools_get_ringparam(struct net_device *ndev, - struct ethtool_ringparam *ering) +static void +ll_temac_ethtools_get_ringparam(struct net_device *ndev, + struct ethtool_ringparam *ering, + struct kernel_ethtool_ringparam *kernel_ering, + struct netlink_ext_ack *extack) { struct temac_local *lp = netdev_priv(ndev);
@@ -1291,8 +1294,11 @@ static void ll_temac_ethtools_get_ringparam(struct net_device *ndev, ering->tx_pending = lp->tx_bd_num; }
-static int ll_temac_ethtools_set_ringparam(struct net_device *ndev, - struct ethtool_ringparam *ering) +static int +ll_temac_ethtools_set_ringparam(struct net_device *ndev, + struct ethtool_ringparam *ering, + struct kernel_ethtool_ringparam *kernel_ering, + struct netlink_ext_ack *extack) { struct temac_local *lp = netdev_priv(ndev);
diff --git a/drivers/net/ethernet/xilinx/xilinx_axienet_main.c b/drivers/net/ethernet/xilinx/xilinx_axienet_main.c index 11c1c5a3240c..89ef84402187 100644 --- a/drivers/net/ethernet/xilinx/xilinx_axienet_main.c +++ b/drivers/net/ethernet/xilinx/xilinx_axienet_main.c @@ -1332,8 +1332,11 @@ static void axienet_ethtools_get_regs(struct net_device *ndev, data[39] = axienet_dma_in32(lp, XAXIDMA_RX_TDESC_OFFSET); }
-static void axienet_ethtools_get_ringparam(struct net_device *ndev, - struct ethtool_ringparam *ering) +static void +axienet_ethtools_get_ringparam(struct net_device *ndev, + struct ethtool_ringparam *ering, + struct kernel_ethtool_ringparam *kernel_ering, + struct netlink_ext_ack *extack) { struct axienet_local *lp = netdev_priv(ndev);
@@ -1347,8 +1350,11 @@ static void axienet_ethtools_get_ringparam(struct net_device *ndev, ering->tx_pending = lp->tx_bd_num; }
-static int axienet_ethtools_set_ringparam(struct net_device *ndev, - struct ethtool_ringparam *ering) +static int +axienet_ethtools_set_ringparam(struct net_device *ndev, + struct ethtool_ringparam *ering, + struct kernel_ethtool_ringparam *kernel_ering, + struct netlink_ext_ack *extack) { struct axienet_local *lp = netdev_priv(ndev);
diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c index 261e6e55a907..c042e22aa704 100644 --- a/drivers/net/hyperv/netvsc_drv.c +++ b/drivers/net/hyperv/netvsc_drv.c @@ -1844,7 +1844,9 @@ static void __netvsc_get_ringparam(struct netvsc_device *nvdev, }
static void netvsc_get_ringparam(struct net_device *ndev, - struct ethtool_ringparam *ring) + struct ethtool_ringparam *ring, + struct kernel_ethtool_ringparam *kernel_ring, + struct netlink_ext_ack *extack) { struct net_device_context *ndevctx = netdev_priv(ndev); struct netvsc_device *nvdev = rtnl_dereference(ndevctx->nvdev); @@ -1856,7 +1858,9 @@ static void netvsc_get_ringparam(struct net_device *ndev, }
static int netvsc_set_ringparam(struct net_device *ndev, - struct ethtool_ringparam *ring) + struct ethtool_ringparam *ring, + struct kernel_ethtool_ringparam *kernel_ring, + struct netlink_ext_ack *extack) { struct net_device_context *ndevctx = netdev_priv(ndev); struct netvsc_device *nvdev = rtnl_dereference(ndevctx->nvdev); diff --git a/drivers/net/usb/r8152.c b/drivers/net/usb/r8152.c index 63b64fa7f33c..842f16815395 100644 --- a/drivers/net/usb/r8152.c +++ b/drivers/net/usb/r8152.c @@ -6307,7 +6307,9 @@ static int rtl8152_set_tunable(struct net_device *netdev, }
static void rtl8152_get_ringparam(struct net_device *netdev, - struct ethtool_ringparam *ring) + struct ethtool_ringparam *ring, + struct kernel_ethtool_ringparam *kernel_ring, + struct netlink_ext_ack *extack) { struct r8152 *tp = netdev_priv(netdev);
@@ -6316,7 +6318,9 @@ static void rtl8152_get_ringparam(struct net_device *netdev, }
static int rtl8152_set_ringparam(struct net_device *netdev, - struct ethtool_ringparam *ring) + struct ethtool_ringparam *ring, + struct kernel_ethtool_ringparam *kernel_ring, + struct netlink_ext_ack *extack) { struct r8152 *tp = netdev_priv(netdev);
diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index 18f0122d4576..362cb347d520 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -2108,7 +2108,9 @@ static void virtnet_cpu_notif_remove(struct virtnet_info *vi) }
static void virtnet_get_ringparam(struct net_device *dev, - struct ethtool_ringparam *ring) + struct ethtool_ringparam *ring, + struct kernel_ethtool_ringparam *kernel_ring, + struct netlink_ext_ack *extack) { struct virtnet_info *vi = netdev_priv(dev);
diff --git a/drivers/net/vmxnet3/vmxnet3_ethtool.c b/drivers/net/vmxnet3/vmxnet3_ethtool.c index d9eee05558fa..49b2b3fb46fa 100644 --- a/drivers/net/vmxnet3/vmxnet3_ethtool.c +++ b/drivers/net/vmxnet3/vmxnet3_ethtool.c @@ -590,10 +590,11 @@ vmxnet3_get_link_ksettings(struct net_device *netdev, return 0; }
- static void vmxnet3_get_ringparam(struct net_device *netdev, - struct ethtool_ringparam *param) + struct ethtool_ringparam *param, + struct kernel_ethtool_ringparam *kernel_param, + struct netlink_ext_ack *extack) { struct vmxnet3_adapter *adapter = netdev_priv(netdev);
@@ -610,10 +611,11 @@ vmxnet3_get_ringparam(struct net_device *netdev, param->rx_jumbo_pending = adapter->rx_ring2_size; }
- static int vmxnet3_set_ringparam(struct net_device *netdev, - struct ethtool_ringparam *param) + struct ethtool_ringparam *param, + struct kernel_ethtool_ringparam *kernel_param, + struct netlink_ext_ack *extack) { struct vmxnet3_adapter *adapter = netdev_priv(netdev); u32 new_tx_ring_size, new_rx_ring_size, new_rx_ring2_size; diff --git a/drivers/s390/net/qeth_ethtool.c b/drivers/s390/net/qeth_ethtool.c index 72f3fa404e13..93012226da28 100644 --- a/drivers/s390/net/qeth_ethtool.c +++ b/drivers/s390/net/qeth_ethtool.c @@ -145,7 +145,9 @@ static int qeth_set_coalesce(struct net_device *dev, }
static void qeth_get_ringparam(struct net_device *dev, - struct ethtool_ringparam *param) + struct ethtool_ringparam *param, + struct kernel_ethtool_ringparam *kernel_param, + struct netlink_ext_ack *extack) { struct qeth_card *card = dev->ml_priv;
diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h index 0bce31a7a508..0074c2e12cb5 100644 --- a/include/linux/ethtool.h +++ b/include/linux/ethtool.h @@ -470,9 +470,13 @@ struct ethtool_ops { struct kernel_ethtool_coalesce *, struct netlink_ext_ack *); void (*get_ringparam)(struct net_device *, - struct ethtool_ringparam *); + struct ethtool_ringparam *, + struct kernel_ethtool_ringparam *, + struct netlink_ext_ack *); int (*set_ringparam)(struct net_device *, - struct ethtool_ringparam *); + struct ethtool_ringparam *, + struct kernel_ethtool_ringparam *, + struct netlink_ext_ack *); void (*get_pause_stats)(struct net_device *dev, struct ethtool_pause_stats *pause_stats); void (*get_pauseparam)(struct net_device *, diff --git a/net/ethtool/ioctl.c b/net/ethtool/ioctl.c index 4601889f7b60..ab056feb44a8 100644 --- a/net/ethtool/ioctl.c +++ b/net/ethtool/ioctl.c @@ -1720,11 +1720,13 @@ static noinline_for_stack int ethtool_set_coalesce(struct net_device *dev, static int ethtool_get_ringparam(struct net_device *dev, void __user *useraddr) { struct ethtool_ringparam ringparam = { .cmd = ETHTOOL_GRINGPARAM }; + struct kernel_ethtool_ringparam kernel_ringparam = {};
if (!dev->ethtool_ops->get_ringparam) return -EOPNOTSUPP;
- dev->ethtool_ops->get_ringparam(dev, &ringparam); + dev->ethtool_ops->get_ringparam(dev, &ringparam, + &kernel_ringparam, NULL);
if (copy_to_user(useraddr, &ringparam, sizeof(ringparam))) return -EFAULT; @@ -1734,6 +1736,7 @@ static int ethtool_get_ringparam(struct net_device *dev, void __user *useraddr) static int ethtool_set_ringparam(struct net_device *dev, void __user *useraddr) { struct ethtool_ringparam ringparam, max = { .cmd = ETHTOOL_GRINGPARAM }; + struct kernel_ethtool_ringparam kernel_ringparam; int ret;
if (!dev->ethtool_ops->set_ringparam || !dev->ethtool_ops->get_ringparam) @@ -1742,7 +1745,7 @@ static int ethtool_set_ringparam(struct net_device *dev, void __user *useraddr) if (copy_from_user(&ringparam, useraddr, sizeof(ringparam))) return -EFAULT;
- dev->ethtool_ops->get_ringparam(dev, &max); + dev->ethtool_ops->get_ringparam(dev, &max, &kernel_ringparam, NULL);
/* ensure new ring parameters are within the maximums */ if (ringparam.rx_pending > max.rx_max_pending || @@ -1751,7 +1754,8 @@ static int ethtool_set_ringparam(struct net_device *dev, void __user *useraddr) ringparam.tx_pending > max.tx_max_pending) return -EINVAL;
- ret = dev->ethtool_ops->set_ringparam(dev, &ringparam); + ret = dev->ethtool_ops->set_ringparam(dev, &ringparam, + &kernel_ringparam, NULL); if (!ret) ethtool_notify(dev, ETHTOOL_MSG_RINGS_NTF, NULL); return ret; diff --git a/net/ethtool/rings.c b/net/ethtool/rings.c index bd8e9a7530eb..450b8866373d 100644 --- a/net/ethtool/rings.c +++ b/net/ethtool/rings.c @@ -26,6 +26,7 @@ static int rings_prepare_data(const struct ethnl_req_info *req_base, struct genl_info *info) { struct rings_reply_data *data = RINGS_REPDATA(reply_base); + struct netlink_ext_ack *extack = info ? info->extack : NULL; struct net_device *dev = reply_base->dev; int ret;
@@ -34,7 +35,8 @@ static int rings_prepare_data(const struct ethnl_req_info *req_base, ret = ethnl_ops_begin(dev); if (ret < 0) return ret; - dev->ethtool_ops->get_ringparam(dev, &data->ringparam); + dev->ethtool_ops->get_ringparam(dev, &data->ringparam, + &data->kernel_ringparam, extack); ethnl_ops_complete(dev);
return 0; @@ -142,7 +144,7 @@ int ethnl_set_rings(struct sk_buff *skb, struct genl_info *info) ret = ethnl_ops_begin(dev); if (ret < 0) goto out_rtnl; - ops->get_ringparam(dev, &ringparam); + ops->get_ringparam(dev, &ringparam, &kernel_ringparam, info->extack);
ethnl_update_u32(&ringparam.rx_pending, tb[ETHTOOL_A_RINGS_RX], &mod); ethnl_update_u32(&ringparam.rx_mini_pending, @@ -183,7 +185,8 @@ int ethnl_set_rings(struct sk_buff *skb, struct genl_info *info) goto out_ops; }
- ret = dev->ethtool_ops->set_ringparam(dev, &ringparam); + ret = dev->ethtool_ops->set_ringparam(dev, &ringparam, + &kernel_ringparam, info->extack); if (ret < 0) goto out_ops; ethtool_notify(dev, ETHTOOL_MSG_RINGS_NTF, NULL); diff --git a/net/mac80211/ethtool.c b/net/mac80211/ethtool.c index 99a2e30b3833..b2253df54413 100644 --- a/net/mac80211/ethtool.c +++ b/net/mac80211/ethtool.c @@ -14,7 +14,9 @@ #include "driver-ops.h"
static int ieee80211_set_ringparam(struct net_device *dev, - struct ethtool_ringparam *rp) + struct ethtool_ringparam *rp, + struct kernel_ethtool_ringparam *kernel_rp, + struct netlink_ext_ack *extack) { struct ieee80211_local *local = wiphy_priv(dev->ieee80211_ptr->wiphy);
@@ -25,7 +27,9 @@ static int ieee80211_set_ringparam(struct net_device *dev, }
static void ieee80211_get_ringparam(struct net_device *dev, - struct ethtool_ringparam *rp) + struct ethtool_ringparam *rp, + struct kernel_ethtool_ringparam *kernel_rp, + struct netlink_ext_ack *extack) { struct ieee80211_local *local = wiphy_priv(dev->ieee80211_ptr->wiphy);
From: Hao Chen chenhao288@hisilicon.com
mainline inclusion from mainline-master commit e65a0231d2ca category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4M1HB CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
----------------------------------------------------------------------
Rx buf len is for rx BD buffer size, support setting it via ethtool -G parameter and getting it via ethtool -g parameter.
Signed-off-by: Hao Chen chenhao288@hisilicon.com Signed-off-by: Guangbin Huang huangguangbin2@huawei.com Signed-off-by: David S. Miller davem@davemloft.net Reviewed-by: Yongxin Li liyongxin1@huawei.com Signed-off-by: Junxin Chen chenjunxin1@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- .../ethernet/hisilicon/hns3/hns3_ethtool.c | 52 ++++++++++++++++--- 1 file changed, 44 insertions(+), 8 deletions(-)
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c b/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c index e35a2661b45d..6b49ecb4a6f3 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c @@ -649,7 +649,7 @@ static void hns3_get_ringparam(struct net_device *netdev, { struct hns3_nic_priv *priv = netdev_priv(netdev); struct hnae3_handle *h = priv->ae_handle; - int queue_num = h->kinfo.num_tqps; + int rx_queue_index = h->kinfo.num_tqps;
if (hns3_nic_resetting(netdev)) { netdev_err(netdev, "dev resetting!"); @@ -660,7 +660,8 @@ static void hns3_get_ringparam(struct net_device *netdev, param->rx_max_pending = HNS3_RING_MAX_PENDING;
param->tx_pending = priv->ring[0].desc_num; - param->rx_pending = priv->ring[queue_num].desc_num; + param->rx_pending = priv->ring[rx_queue_index].desc_num; + kernel_param->rx_buf_len = priv->ring[rx_queue_index].buf_size; }
static void hns3_get_pauseparam(struct net_device *netdev, @@ -1062,14 +1063,23 @@ static struct hns3_enet_ring *hns3_backup_ringparam(struct hns3_nic_priv *priv) }
static int hns3_check_ringparam(struct net_device *ndev, - struct ethtool_ringparam *param) + struct ethtool_ringparam *param, + struct kernel_ethtool_ringparam *kernel_param) { +#define RX_BUF_LEN_2K 2048 +#define RX_BUF_LEN_4K 4096 if (hns3_nic_resetting(ndev)) return -EBUSY;
if (param->rx_mini_pending || param->rx_jumbo_pending) return -EINVAL;
+ if (kernel_param->rx_buf_len != RX_BUF_LEN_2K && + kernel_param->rx_buf_len != RX_BUF_LEN_4K) { + netdev_err(ndev, "Rx buf len only support 2048 and 4096\n"); + return -EINVAL; + } + if (param->tx_pending > HNS3_RING_MAX_PENDING || param->tx_pending < HNS3_RING_MIN_PENDING || param->rx_pending > HNS3_RING_MAX_PENDING || @@ -1082,6 +1092,22 @@ static int hns3_check_ringparam(struct net_device *ndev, return 0; }
+static int hns3_change_rx_buf_len(struct net_device *ndev, u32 rx_buf_len) +{ + struct hns3_nic_priv *priv = netdev_priv(ndev); + struct hnae3_handle *h = priv->ae_handle; + int i; + + h->kinfo.rx_buf_len = rx_buf_len; + + for (i = 0; i < h->kinfo.num_tqps; i++) { + h->kinfo.tqp[i]->buf_size = rx_buf_len; + priv->ring[i + h->kinfo.num_tqps].buf_size = rx_buf_len; + } + + return 0; +} + static int hns3_set_ringparam(struct net_device *ndev, struct ethtool_ringparam *param, struct kernel_ethtool_ringparam *kernel_param, @@ -1094,9 +1120,10 @@ static int hns3_set_ringparam(struct net_device *ndev, u32 old_tx_desc_num, new_tx_desc_num; u32 old_rx_desc_num, new_rx_desc_num; u16 queue_num = h->kinfo.num_tqps; + u32 old_rx_buf_len; int ret, i;
- ret = hns3_check_ringparam(ndev, param); + ret = hns3_check_ringparam(ndev, param, kernel_param); if (ret) return ret;
@@ -1105,8 +1132,10 @@ static int hns3_set_ringparam(struct net_device *ndev, new_rx_desc_num = ALIGN(param->rx_pending, HNS3_RING_BD_MULTIPLE); old_tx_desc_num = priv->ring[0].desc_num; old_rx_desc_num = priv->ring[queue_num].desc_num; + old_rx_buf_len = priv->ring[queue_num].buf_size; if (old_tx_desc_num == new_tx_desc_num && - old_rx_desc_num == new_rx_desc_num) + old_rx_desc_num == new_rx_desc_num && + kernel_param->rx_buf_len == old_rx_buf_len) return 0;
tmp_rings = hns3_backup_ringparam(priv); @@ -1117,19 +1146,22 @@ static int hns3_set_ringparam(struct net_device *ndev, }
netdev_info(ndev, - "Changing Tx/Rx ring depth from %u/%u to %u/%u\n", + "Changing Tx/Rx ring depth from %u/%u to %u/%u, Changing rx buffer len from %d to %d\n", old_tx_desc_num, old_rx_desc_num, - new_tx_desc_num, new_rx_desc_num); + new_tx_desc_num, new_rx_desc_num, + old_rx_buf_len, kernel_param->rx_buf_len);
if (if_running) ndev->netdev_ops->ndo_stop(ndev);
hns3_change_all_ring_bd_num(priv, new_tx_desc_num, new_rx_desc_num); + hns3_change_rx_buf_len(ndev, kernel_param->rx_buf_len); ret = hns3_init_all_ring(priv); if (ret) { - netdev_err(ndev, "Change bd num fail, revert to old value(%d)\n", + netdev_err(ndev, "set ringparam fail, revert to old value(%d)\n", ret);
+ hns3_change_rx_buf_len(ndev, old_rx_buf_len); hns3_change_all_ring_bd_num(priv, old_tx_desc_num, old_rx_desc_num); for (i = 0; i < h->kinfo.num_tqps * 2; i++) @@ -1811,6 +1843,8 @@ static int hns3_set_tunable(struct net_device *netdev, ETHTOOL_COALESCE_MAX_FRAMES | \ ETHTOOL_COALESCE_USE_CQE)
+#define HNS3_ETHTOOL_RING ETHTOOL_RING_USE_RX_BUF_LEN + static int hns3_get_ts_info(struct net_device *netdev, struct ethtool_ts_info *info) { @@ -1889,6 +1923,7 @@ static int hns3_get_link_ext_state(struct net_device *netdev,
static const struct ethtool_ops hns3vf_ethtool_ops = { .supported_coalesce_params = HNS3_ETHTOOL_COALESCE, + .supported_ring_params = HNS3_ETHTOOL_RING, .get_drvinfo = hns3_get_drvinfo, .get_ringparam = hns3_get_ringparam, .set_ringparam = hns3_set_ringparam, @@ -1920,6 +1955,7 @@ static const struct ethtool_ops hns3vf_ethtool_ops = {
static const struct ethtool_ops hns3_ethtool_ops = { .supported_coalesce_params = HNS3_ETHTOOL_COALESCE, + .supported_ring_params = HNS3_ETHTOOL_RING, .self_test = hns3_self_test, .get_drvinfo = hns3_get_drvinfo, .get_link = hns3_get_link,
From: Hao Chen chenhao288@hisilicon.com
mainline inclusion from mainline-master commit e175eb5fb054 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4M1HB CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
----------------------------------------------------------------------
The way to set tx spare buf via module parameter is not such convenient as the way to set it via ethtool.
So,remove the way to set tx spare buf via module parameter.
Signed-off-by: Hao Chen chenhao288@hisilicon.com Signed-off-by: Guangbin Huang huangguangbin2@huawei.com Signed-off-by: David S. Miller davem@davemloft.net Reviewed-by: Yongxin Li liyongxin1@huawei.com Signed-off-by: Junxin Chen chenjunxin1@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- drivers/net/ethernet/hisilicon/hns3/hns3_enet.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-)
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c index cb0d5381482e..e7183548b91d 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c @@ -53,10 +53,6 @@ static int debug = -1; module_param(debug, int, 0); MODULE_PARM_DESC(debug, " Network interface message level setting");
-static unsigned int tx_spare_buf_size; -module_param(tx_spare_buf_size, uint, 0400); -MODULE_PARM_DESC(tx_spare_buf_size, "Size used to allocate tx spare buffer"); - static unsigned int tx_sgl = 1; module_param(tx_sgl, uint, 0600); MODULE_PARM_DESC(tx_sgl, "Minimum number of frags when using dma_map_sg() to optimize the IOMMU mapping"); @@ -1041,8 +1037,7 @@ static void hns3_init_tx_spare_buffer(struct hns3_enet_ring *ring) dma_addr_t dma; int order;
- alloc_size = tx_spare_buf_size ? tx_spare_buf_size : - ring->tqp->handle->kinfo.tx_spare_buf_size; + alloc_size = ring->tqp->handle->kinfo.tx_spare_buf_size; if (!alloc_size) return;
From: Yufeng Mo moyufeng@huawei.com
mainline inclusion from mainline-master commit d9069dab2075 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4M1HB CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
----------------------------------------------------------------------
When the mbx or reset message arrives, the driver is informed through an interrupt. This task can be processed only after the workqueue is scheduled. In some cases, this workqueue scheduling takes a long time. As a result, the mbx or reset service task cannot be processed in time. So add some warning message to improve debugging efficiency for this case.
Signed-off-by: Yufeng Mo moyufeng@huawei.com Signed-off-by: Guangbin Huang huangguangbin2@huawei.com Signed-off-by: David S. Miller davem@davemloft.net Reviewed-by: Yongxin Li liyongxin1@huawei.com Signed-off-by: Junxin Chen chenjunxin1@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- .../net/ethernet/hisilicon/hns3/hclge_mbx.h | 3 +++ .../hisilicon/hns3/hns3pf/hclge_main.c | 22 +++++++++++++++++-- .../hisilicon/hns3/hns3pf/hclge_main.h | 2 ++ .../hisilicon/hns3/hns3pf/hclge_mbx.c | 8 +++++++ 4 files changed, 33 insertions(+), 2 deletions(-)
diff --git a/drivers/net/ethernet/hisilicon/hns3/hclge_mbx.h b/drivers/net/ethernet/hisilicon/hns3/hclge_mbx.h index c2bd2584201f..b668df6193be 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hclge_mbx.h +++ b/drivers/net/ethernet/hisilicon/hns3/hclge_mbx.h @@ -80,6 +80,9 @@ enum hclge_mbx_tbl_cfg_subcode { #define HCLGE_MBX_MAX_RESP_DATA_SIZE 8U #define HCLGE_MBX_MAX_RING_CHAIN_PARAM_NUM 4
+#define HCLGE_RESET_SCHED_TIMEOUT (3 * HZ) +#define HCLGE_MBX_SCHED_TIMEOUT (HZ / 2) + struct hclge_ring_chain_param { u8 ring_type; u8 tqp_index; diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c index 6d99170756c3..df3262bf498d 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c @@ -2933,16 +2933,20 @@ static int hclge_mac_init(struct hclge_dev *hdev) static void hclge_mbx_task_schedule(struct hclge_dev *hdev) { if (!test_bit(HCLGE_STATE_REMOVING, &hdev->state) && - !test_and_set_bit(HCLGE_STATE_MBX_SERVICE_SCHED, &hdev->state)) + !test_and_set_bit(HCLGE_STATE_MBX_SERVICE_SCHED, &hdev->state)) { + hdev->last_mbx_scheduled = jiffies; mod_delayed_work(hclge_wq, &hdev->service_task, 0); + } }
static void hclge_reset_task_schedule(struct hclge_dev *hdev) { if (!test_bit(HCLGE_STATE_REMOVING, &hdev->state) && test_bit(HCLGE_STATE_SERVICE_INITED, &hdev->state) && - !test_and_set_bit(HCLGE_STATE_RST_SERVICE_SCHED, &hdev->state)) + !test_and_set_bit(HCLGE_STATE_RST_SERVICE_SCHED, &hdev->state)) { + hdev->last_rst_scheduled = jiffies; mod_delayed_work(hclge_wq, &hdev->service_task, 0); + } }
static void hclge_errhand_task_schedule(struct hclge_dev *hdev) @@ -3831,6 +3835,13 @@ static void hclge_mailbox_service_task(struct hclge_dev *hdev) test_and_set_bit(HCLGE_STATE_MBX_HANDLING, &hdev->state)) return;
+ if (time_is_before_jiffies(hdev->last_mbx_scheduled + + HCLGE_MBX_SCHED_TIMEOUT)) + dev_warn(&hdev->pdev->dev, + "mbx service task is scheduled after %ums on cpu%u!\n", + jiffies_to_msecs(jiffies - hdev->last_mbx_scheduled), + smp_processor_id()); + hclge_mbx_handler(hdev);
clear_bit(HCLGE_STATE_MBX_HANDLING, &hdev->state); @@ -4480,6 +4491,13 @@ static void hclge_reset_service_task(struct hclge_dev *hdev) if (!test_and_clear_bit(HCLGE_STATE_RST_SERVICE_SCHED, &hdev->state)) return;
+ if (time_is_before_jiffies(hdev->last_rst_scheduled + + HCLGE_RESET_SCHED_TIMEOUT)) + dev_warn(&hdev->pdev->dev, + "reset service task is scheduled after %ums on cpu%u!\n", + jiffies_to_msecs(jiffies - hdev->last_rst_scheduled), + smp_processor_id()); + down(&hdev->reset_sem); set_bit(HCLGE_STATE_RST_HANDLING, &hdev->state);
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h index ebba603483a0..42ce1eee33c4 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h @@ -955,6 +955,8 @@ struct hclge_dev { u16 hclge_fd_rule_num; unsigned long serv_processed_cnt; unsigned long last_serv_processed; + unsigned long last_rst_scheduled; + unsigned long last_mbx_scheduled; unsigned long fd_bmap[BITS_TO_LONGS(MAX_FD_FILTER_NUM)]; enum HCLGE_FD_ACTIVE_RULE_TYPE fd_active_type; u8 fd_en; diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_mbx.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_mbx.c index 65d78ee4d65a..c495df2e5953 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_mbx.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_mbx.c @@ -848,6 +848,14 @@ void hclge_mbx_handler(struct hclge_dev *hdev) if (hnae3_get_bit(req->mbx_need_resp, HCLGE_MBX_NEED_RESP_B) && req->msg.code < HCLGE_MBX_GET_VF_FLR_STATUS) { resp_msg.status = ret; + if (time_is_before_jiffies(hdev->last_mbx_scheduled + + HCLGE_MBX_SCHED_TIMEOUT)) + dev_warn(&hdev->pdev->dev, + "resp vport%u mbx(%u,%u) late\n", + req->mbx_src_vfid, + req->msg.code, + req->msg.subcode); + hclge_gen_resp_to_vf(vport, req, &resp_msg); }
From: Yufeng Mo moyufeng@huawei.com
mainline inclusion from mainline-master commit 4f331fda35f1 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4M1HB CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
----------------------------------------------------------------------
Printing the whole MAC addresse may bring security risks. Therefore, the MAC address is partially encrypted to improve security.
Signed-off-by: Yufeng Mo moyufeng@huawei.com Signed-off-by: Guangbin Huang huangguangbin2@huawei.com Signed-off-by: David S. Miller davem@davemloft.net Reviewed-by: Yongxin Li liyongxin1@huawei.com Signed-off-by: Junxin Chen chenjunxin1@huawei.com (fix conflicts: replace eth_hw_addr_set by ether_addr_copy) Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- drivers/net/ethernet/hisilicon/hns3/hnae3.h | 14 +++++ .../net/ethernet/hisilicon/hns3/hns3_enet.c | 29 +++++++--- .../hisilicon/hns3/hns3pf/hclge_main.c | 55 ++++++++++++------- .../hisilicon/hns3/hns3vf/hclgevf_main.c | 7 ++- 4 files changed, 74 insertions(+), 31 deletions(-)
diff --git a/drivers/net/ethernet/hisilicon/hns3/hnae3.h b/drivers/net/ethernet/hisilicon/hns3/hnae3.h index 785d1f1f2a08..a2ca47757ef9 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hnae3.h +++ b/drivers/net/ethernet/hisilicon/hns3/hnae3.h @@ -858,6 +858,20 @@ struct hnae3_handle { #define hnae3_get_bit(origin, shift) \ hnae3_get_field(origin, 0x1 << (shift), shift)
+#define HNAE3_FORMAT_MAC_ADDR_LEN 18 +#define HNAE3_FORMAT_MAC_ADDR_OFFSET_0 0 +#define HNAE3_FORMAT_MAC_ADDR_OFFSET_4 4 +#define HNAE3_FORMAT_MAC_ADDR_OFFSET_5 5 + +static inline void hnae3_format_mac_addr(char *format_mac_addr, + const u8 *mac_addr) +{ + snprintf(format_mac_addr, HNAE3_FORMAT_MAC_ADDR_LEN, "%02x:**:**:**:%02x:%02x", + mac_addr[HNAE3_FORMAT_MAC_ADDR_OFFSET_0], + mac_addr[HNAE3_FORMAT_MAC_ADDR_OFFSET_4], + mac_addr[HNAE3_FORMAT_MAC_ADDR_OFFSET_5]); +} + int hnae3_register_ae_dev(struct hnae3_ae_dev *ae_dev); void hnae3_unregister_ae_dev(struct hnae3_ae_dev *ae_dev);
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c index e7183548b91d..c52ca43ffabb 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c @@ -2250,6 +2250,8 @@ netdev_tx_t hns3_nic_net_xmit(struct sk_buff *skb, struct net_device *netdev)
static int hns3_nic_net_set_mac_address(struct net_device *netdev, void *p) { + char format_mac_addr_perm[HNAE3_FORMAT_MAC_ADDR_LEN]; + char format_mac_addr_sa[HNAE3_FORMAT_MAC_ADDR_LEN]; struct hnae3_handle *h = hns3_get_handle(netdev); struct sockaddr *mac_addr = p; int ret; @@ -2258,8 +2260,9 @@ static int hns3_nic_net_set_mac_address(struct net_device *netdev, void *p) return -EADDRNOTAVAIL;
if (ether_addr_equal(netdev->dev_addr, mac_addr->sa_data)) { - netdev_info(netdev, "already using mac address %pM\n", - mac_addr->sa_data); + hnae3_format_mac_addr(format_mac_addr_sa, mac_addr->sa_data); + netdev_info(netdev, "already using mac address %s\n", + format_mac_addr_sa); return 0; }
@@ -2268,8 +2271,10 @@ static int hns3_nic_net_set_mac_address(struct net_device *netdev, void *p) */ if (!hns3_is_phys_func(h->pdev) && !is_zero_ether_addr(netdev->perm_addr)) { - netdev_err(netdev, "has permanent MAC %pM, user MAC %pM not allow\n", - netdev->perm_addr, mac_addr->sa_data); + hnae3_format_mac_addr(format_mac_addr_perm, netdev->perm_addr); + hnae3_format_mac_addr(format_mac_addr_sa, mac_addr->sa_data); + netdev_err(netdev, "has permanent MAC %s, user MAC %s not allow\n", + format_mac_addr_perm, format_mac_addr_sa); return -EPERM; }
@@ -2831,14 +2836,16 @@ static int hns3_nic_set_vf_rate(struct net_device *ndev, int vf, static int hns3_nic_set_vf_mac(struct net_device *netdev, int vf_id, u8 *mac) { struct hnae3_handle *h = hns3_get_handle(netdev); + char format_mac_addr[HNAE3_FORMAT_MAC_ADDR_LEN];
if (!h->ae_algo->ops->set_vf_mac) return -EOPNOTSUPP;
if (is_multicast_ether_addr(mac)) { + hnae3_format_mac_addr(format_mac_addr, mac); netdev_err(netdev, - "Invalid MAC:%pM specified. Could not set MAC\n", - mac); + "Invalid MAC:%s specified. Could not set MAC\n", + format_mac_addr); return -EINVAL; }
@@ -4929,6 +4936,7 @@ static void hns3_uninit_all_ring(struct hns3_nic_priv *priv) static int hns3_init_mac_addr(struct net_device *netdev) { struct hns3_nic_priv *priv = netdev_priv(netdev); + char format_mac_addr[HNAE3_FORMAT_MAC_ADDR_LEN]; struct hnae3_handle *h = priv->ae_handle; u8 mac_addr_temp[ETH_ALEN]; int ret = 0; @@ -4939,8 +4947,9 @@ static int hns3_init_mac_addr(struct net_device *netdev) /* Check if the MAC address is valid, if not get a random one */ if (!is_valid_ether_addr(mac_addr_temp)) { eth_hw_addr_random(netdev); - dev_warn(priv->dev, "using random MAC address %pM\n", - netdev->dev_addr); + hnae3_format_mac_addr(format_mac_addr, netdev->dev_addr); + dev_warn(priv->dev, "using random MAC address %s\n", + format_mac_addr); } else if (!ether_addr_equal(netdev->dev_addr, mac_addr_temp)) { ether_addr_copy(netdev->dev_addr, mac_addr_temp); ether_addr_copy(netdev->perm_addr, mac_addr_temp); @@ -4992,8 +5001,10 @@ static void hns3_client_stop(struct hnae3_handle *handle) static void hns3_info_show(struct hns3_nic_priv *priv) { struct hnae3_knic_private_info *kinfo = &priv->ae_handle->kinfo; + char format_mac_addr[HNAE3_FORMAT_MAC_ADDR_LEN];
- dev_info(priv->dev, "MAC address: %pM\n", priv->netdev->dev_addr); + hnae3_format_mac_addr(format_mac_addr, priv->netdev->dev_addr); + dev_info(priv->dev, "MAC address: %s\n", format_mac_addr); dev_info(priv->dev, "Task queue pairs numbers: %u\n", kinfo->num_tqps); dev_info(priv->dev, "RSS size: %u\n", kinfo->rss_size); dev_info(priv->dev, "Allocated RSS size: %u\n", kinfo->req_rss_size); diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c index df3262bf498d..ff5baeef5204 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c @@ -8761,6 +8761,7 @@ int hclge_update_mac_list(struct hclge_vport *vport, enum HCLGE_MAC_ADDR_TYPE mac_type, const unsigned char *addr) { + char format_mac_addr[HNAE3_FORMAT_MAC_ADDR_LEN]; struct hclge_dev *hdev = vport->back; struct hclge_mac_node *mac_node; struct list_head *list; @@ -8785,9 +8786,10 @@ int hclge_update_mac_list(struct hclge_vport *vport, /* if this address is never added, unnecessary to delete */ if (state == HCLGE_MAC_TO_DEL) { spin_unlock_bh(&vport->mac_list_lock); + hnae3_format_mac_addr(format_mac_addr, addr); dev_err(&hdev->pdev->dev, - "failed to delete address %pM from mac list\n", - addr); + "failed to delete address %s from mac list\n", + format_mac_addr); return -ENOENT; }
@@ -8820,6 +8822,7 @@ static int hclge_add_uc_addr(struct hnae3_handle *handle, int hclge_add_uc_addr_common(struct hclge_vport *vport, const unsigned char *addr) { + char format_mac_addr[HNAE3_FORMAT_MAC_ADDR_LEN]; struct hclge_dev *hdev = vport->back; struct hclge_mac_vlan_tbl_entry_cmd req; struct hclge_desc desc; @@ -8830,9 +8833,10 @@ int hclge_add_uc_addr_common(struct hclge_vport *vport, if (is_zero_ether_addr(addr) || is_broadcast_ether_addr(addr) || is_multicast_ether_addr(addr)) { + hnae3_format_mac_addr(format_mac_addr, addr); dev_err(&hdev->pdev->dev, - "Set_uc mac err! invalid mac:%pM. is_zero:%d,is_br=%d,is_mul=%d\n", - addr, is_zero_ether_addr(addr), + "Set_uc mac err! invalid mac:%s. is_zero:%d,is_br=%d,is_mul=%d\n", + format_mac_addr, is_zero_ether_addr(addr), is_broadcast_ether_addr(addr), is_multicast_ether_addr(addr)); return -EINVAL; @@ -8889,6 +8893,7 @@ static int hclge_rm_uc_addr(struct hnae3_handle *handle, int hclge_rm_uc_addr_common(struct hclge_vport *vport, const unsigned char *addr) { + char format_mac_addr[HNAE3_FORMAT_MAC_ADDR_LEN]; struct hclge_dev *hdev = vport->back; struct hclge_mac_vlan_tbl_entry_cmd req; int ret; @@ -8897,8 +8902,9 @@ int hclge_rm_uc_addr_common(struct hclge_vport *vport, if (is_zero_ether_addr(addr) || is_broadcast_ether_addr(addr) || is_multicast_ether_addr(addr)) { - dev_dbg(&hdev->pdev->dev, "Remove mac err! invalid mac:%pM.\n", - addr); + hnae3_format_mac_addr(format_mac_addr, addr); + dev_dbg(&hdev->pdev->dev, "Remove mac err! invalid mac:%s.\n", + format_mac_addr); return -EINVAL; }
@@ -8929,6 +8935,7 @@ static int hclge_add_mc_addr(struct hnae3_handle *handle, int hclge_add_mc_addr_common(struct hclge_vport *vport, const unsigned char *addr) { + char format_mac_addr[HNAE3_FORMAT_MAC_ADDR_LEN]; struct hclge_dev *hdev = vport->back; struct hclge_mac_vlan_tbl_entry_cmd req; struct hclge_desc desc[3]; @@ -8937,9 +8944,10 @@ int hclge_add_mc_addr_common(struct hclge_vport *vport,
/* mac addr check */ if (!is_multicast_ether_addr(addr)) { + hnae3_format_mac_addr(format_mac_addr, addr); dev_err(&hdev->pdev->dev, - "Add mc mac err! invalid mac:%pM.\n", - addr); + "Add mc mac err! invalid mac:%s.\n", + format_mac_addr); return -EINVAL; } memset(&req, 0, sizeof(req)); @@ -8991,6 +8999,7 @@ static int hclge_rm_mc_addr(struct hnae3_handle *handle, int hclge_rm_mc_addr_common(struct hclge_vport *vport, const unsigned char *addr) { + char format_mac_addr[HNAE3_FORMAT_MAC_ADDR_LEN]; struct hclge_dev *hdev = vport->back; struct hclge_mac_vlan_tbl_entry_cmd req; enum hclge_cmd_status status; @@ -8998,9 +9007,10 @@ int hclge_rm_mc_addr_common(struct hclge_vport *vport,
/* mac addr check */ if (!is_multicast_ether_addr(addr)) { + hnae3_format_mac_addr(format_mac_addr, addr); dev_dbg(&hdev->pdev->dev, - "Remove mc mac err! invalid mac:%pM.\n", - addr); + "Remove mc mac err! invalid mac:%s.\n", + format_mac_addr); return -EINVAL; }
@@ -9440,16 +9450,18 @@ static int hclge_set_vf_mac(struct hnae3_handle *handle, int vf, u8 *mac_addr) { struct hclge_vport *vport = hclge_get_vport(handle); + char format_mac_addr[HNAE3_FORMAT_MAC_ADDR_LEN]; struct hclge_dev *hdev = vport->back;
vport = hclge_get_vf_vport(hdev, vf); if (!vport) return -EINVAL;
+ hnae3_format_mac_addr(format_mac_addr, mac_addr); if (ether_addr_equal(mac_addr, vport->vf_info.mac)) { dev_info(&hdev->pdev->dev, - "Specified MAC(=%pM) is same as before, no change committed!\n", - mac_addr); + "Specified MAC(=%s) is same as before, no change committed!\n", + format_mac_addr); return 0; }
@@ -9457,13 +9469,13 @@ static int hclge_set_vf_mac(struct hnae3_handle *handle, int vf,
if (test_bit(HCLGE_VPORT_STATE_ALIVE, &vport->state)) { dev_info(&hdev->pdev->dev, - "MAC of VF %d has been set to %pM, and it will be reinitialized!\n", - vf, mac_addr); + "MAC of VF %d has been set to %s, and it will be reinitialized!\n", + vf, format_mac_addr); return hclge_inform_reset_assert_to_vf(vport); }
- dev_info(&hdev->pdev->dev, "MAC of VF %d has been set to %pM\n", - vf, mac_addr); + dev_info(&hdev->pdev->dev, "MAC of VF %d has been set to %s\n", + vf, format_mac_addr); return 0; }
@@ -9567,6 +9579,7 @@ static int hclge_set_mac_addr(struct hnae3_handle *handle, void *p, { const unsigned char *new_addr = (const unsigned char *)p; struct hclge_vport *vport = hclge_get_vport(handle); + char format_mac_addr[HNAE3_FORMAT_MAC_ADDR_LEN]; struct hclge_dev *hdev = vport->back; unsigned char *old_addr = NULL; int ret; @@ -9575,9 +9588,10 @@ static int hclge_set_mac_addr(struct hnae3_handle *handle, void *p, if (is_zero_ether_addr(new_addr) || is_broadcast_ether_addr(new_addr) || is_multicast_ether_addr(new_addr)) { + hnae3_format_mac_addr(format_mac_addr, new_addr); dev_err(&hdev->pdev->dev, - "change uc mac err! invalid mac: %pM.\n", - new_addr); + "change uc mac err! invalid mac: %s.\n", + format_mac_addr); return -EINVAL; }
@@ -9595,9 +9609,10 @@ static int hclge_set_mac_addr(struct hnae3_handle *handle, void *p, spin_lock_bh(&vport->mac_list_lock); ret = hclge_update_mac_node_for_dev_addr(vport, old_addr, new_addr); if (ret) { + hnae3_format_mac_addr(format_mac_addr, new_addr); dev_err(&hdev->pdev->dev, - "failed to change the mac addr:%pM, ret = %d\n", - new_addr, ret); + "failed to change the mac addr:%s, ret = %d\n", + format_mac_addr, ret); spin_unlock_bh(&vport->mac_list_lock);
if (!is_first) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c index 3b8bde58613a..a069b99126e6 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c @@ -1514,15 +1514,18 @@ static void hclgevf_config_mac_list(struct hclgevf_dev *hdev, struct list_head *list, enum HCLGEVF_MAC_ADDR_TYPE mac_type) { + char format_mac_addr[HNAE3_FORMAT_MAC_ADDR_LEN]; struct hclgevf_mac_addr_node *mac_node, *tmp; int ret;
list_for_each_entry_safe(mac_node, tmp, list, node) { ret = hclgevf_add_del_mac_addr(hdev, mac_node, mac_type); if (ret) { + hnae3_format_mac_addr(format_mac_addr, + mac_node->mac_addr); dev_err(&hdev->pdev->dev, - "failed to configure mac %pM, state = %d, ret = %d\n", - mac_node->mac_addr, mac_node->state, ret); + "failed to configure mac %s, state = %d, ret = %d\n", + format_mac_addr, mac_node->state, ret); return; } if (mac_node->state == HCLGEVF_MAC_TO_ADD) {
From: Jie Wang wangjie125@huawei.com
mainline inclusion from mainline-master commit 8488e3c68214 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4M1HB CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
----------------------------------------------------------------------
Currently, there is no way to get drop packet number of multicast and broadcast in IGU hardware module, it is not convenient to find problem when multicast packet or broadcast packet is dropped in IGU, so this patch adds statistics for them in debugfs.
Signed-off-by: Jie Wang wangjie125@huawei.com Signed-off-by: Guangbin Huang huangguangbin2@huawei.com Signed-off-by: David S. Miller davem@davemloft.net Reviewed-by: Yongxin Li liyongxin1@huawei.com Signed-off-by: Junxin Chen chenjunxin1@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- .../net/ethernet/hisilicon/hns3/hns3pf/hclge_debugfs.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-)
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_debugfs.h b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_debugfs.h index c526591a7240..5b6018204f92 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_debugfs.h +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_debugfs.h @@ -321,10 +321,10 @@ static const struct hclge_dbg_dfx_message hclge_dbg_igu_egu_reg[] = { {true, "IGU_RX_OUT_UDP0_PKT"},
{true, "IGU_RX_IN_UDP0_PKT"}, - {false, "Reserved"}, - {false, "Reserved"}, - {false, "Reserved"}, - {false, "Reserved"}, + {true, "IGU_MC_CAR_DROP_PKT_L"}, + {true, "IGU_MC_CAR_DROP_PKT_H"}, + {true, "IGU_BC_CAR_DROP_PKT_L"}, + {true, "IGU_BC_CAR_DROP_PKT_H"}, {false, "Reserved"},
{true, "IGU_RX_OVERSIZE_PKT_L"},
From: Yufeng Mo moyufeng@huawei.com
mainline inclusion from mainline-master commit db596298edbf category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4M1HB CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
----------------------------------------------------------------------
When tx timeout occurs, the info of dql maybe helpful, so print these info to hns3_get_tx_timeo_queue_info().
Signed-off-by: Yufeng Mo moyufeng@huawei.com Signed-off-by: Guangbin Huang huangguangbin2@huawei.com Signed-off-by: David S. Miller davem@davemloft.net Reviewed-by: Yongxin Li liyongxin1@huawei.com Signed-off-by: Junxin Chen chenjunxin1@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- drivers/net/ethernet/hisilicon/hns3/hns3_enet.c | 7 +++++++ 1 file changed, 7 insertions(+)
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c index c52ca43ffabb..447e9719b528 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c @@ -2682,6 +2682,13 @@ static bool hns3_get_tx_timeo_queue_info(struct net_device *ndev) if (netif_xmit_stopped(q) && time_after(jiffies, (trans_start + ndev->watchdog_timeo))) { +#ifdef CONFIG_BQL + struct dql *dql = &q->dql; + + netdev_info(ndev, "DQL info last_cnt: %u, queued: %u, adj_limit: %u, completed: %u\n", + dql->last_obj_cnt, dql->num_queued, + dql->adj_limit, dql->num_completed); +#endif timeout_queue = i; netdev_info(ndev, "queue state: 0x%lx, delta msecs: %u\n", q->state,
From: Guangbin Huang huangguangbin2@huawei.com
mainline inclusion from mainline-v5.16-rc3 commit 8d2ad993aa05 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4M1HB CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
----------------------------------------------------------------------
When PF is set to multi-TCs and configured mapping relationship between priorities and TCs, the hardware will active these settings for this PF and its VFs.
In this case when VF just uses one TC and its rx packets contain priority, and if the priority is not mapped to TC0, as other TCs of VF is not valid, hardware always put this kind of packets to the queue 0. It cause this kind of packets of VF can not be used RSS function.
To fix this problem, set tc mode of all unused TCs of VF to the setting of TC0, then rx packet with priority which map to unused TC will be direct to TC0.
Fixes: e2cb1dec9779 ("net: hns3: Add HNS3 VF HCL(Hardware Compatibility Layer) Support") Signed-off-by: Guangbin Huang huangguangbin2@huawei.com Signed-off-by: Jakub Kicinski kuba@kernel.org Reviewed-by: Yongxin Li liyongxin1@huawei.com Signed-off-by: Junxin Chen chenjunxin1@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c index a069b99126e6..d7a7c8240bb2 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c @@ -703,9 +703,9 @@ static int hclgevf_set_rss_tc_mode(struct hclgevf_dev *hdev, u16 rss_size) roundup_size = ilog2(roundup_size);
for (i = 0; i < HCLGEVF_MAX_TC_NUM; i++) { - tc_valid[i] = !!(hdev->hw_tc_map & BIT(i)); + tc_valid[i] = 1; tc_size[i] = roundup_size; - tc_offset[i] = rss_size * i; + tc_offset[i] = (hdev->hw_tc_map & BIT(i)) ? rss_size * i : 0; }
hclgevf_cmd_setup_basic_desc(&desc, HCLGEVF_OPC_RSS_TC_MODE, false);
From: Hao Chen chenhao288@hisilicon.com
mainline inclusion from mainline-v5.16-rc3 commit b8af344cfea1 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4M1HB CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
----------------------------------------------------------------------
When page pool is not enabled, its address value is still NULL and page pool should not be accessed, so add a check for it.
Fixes: 850bfb912a6d ("net: hns3: debugfs add support dumping page pool info") Signed-off-by: Hao Chen chenhao288@hisilicon.com Signed-off-by: Guangbin Huang huangguangbin2@huawei.com Signed-off-by: Jakub Kicinski kuba@kernel.org Reviewed-by: Yongxin Li liyongxin1@huawei.com Signed-off-by: Junxin Chen chenjunxin1@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- drivers/net/ethernet/hisilicon/hns3/hns3_debugfs.c | 5 +++++ 1 file changed, 5 insertions(+)
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_debugfs.c b/drivers/net/ethernet/hisilicon/hns3/hns3_debugfs.c index 67364ab63a1f..fbb8a5f08222 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3_debugfs.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3_debugfs.c @@ -1106,6 +1106,11 @@ hns3_dbg_page_pool_info(struct hnae3_handle *h, char *buf, int len) return -EFAULT; }
+ if (!priv->ring[h->kinfo.num_tqps].page_pool) { + dev_err(&h->pdev->dev, "page pool is not initialized\n"); + return -EFAULT; + } + for (i = 0; i < ARRAY_SIZE(page_pool_info_items); i++) result[i] = &data_str[i][0];
From: Hao Chen chenhao288@hisilicon.com
mainline inclusion from mainline-v5.16-rc3 commit 9c1479174870 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4M1HB CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
----------------------------------------------------------------------
Currently, when user queries page pool info by debugfs command "cat page_pool_info", the cnt of allocated page for page pool may be incorrect because of memory inconsistency problem caused by compiler optimization.
So this patch uses READ_ONCE() to read value of pages_state_hold_cnt to fix this problem.
Fixes: 850bfb912a6d ("net: hns3: debugfs add support dumping page pool info") Signed-off-by: Hao Chen chenhao288@hisilicon.com Signed-off-by: Guangbin Huang huangguangbin2@huawei.com Signed-off-by: Jakub Kicinski kuba@kernel.org Reviewed-by: Yongxin Li liyongxin1@huawei.com Signed-off-by: Junxin Chen chenjunxin1@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- drivers/net/ethernet/hisilicon/hns3/hns3_debugfs.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_debugfs.c b/drivers/net/ethernet/hisilicon/hns3/hns3_debugfs.c index fbb8a5f08222..081295bff765 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3_debugfs.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3_debugfs.c @@ -1081,7 +1081,8 @@ static void hns3_dump_page_pool_info(struct hns3_enet_ring *ring, u32 j = 0;
sprintf(result[j++], "%u", index); - sprintf(result[j++], "%u", ring->page_pool->pages_state_hold_cnt); + sprintf(result[j++], "%u", + READ_ONCE(ring->page_pool->pages_state_hold_cnt)); sprintf(result[j++], "%u", atomic_read(&ring->page_pool->pages_state_release_cnt)); sprintf(result[j++], "%u", ring->page_pool->p.pool_size);
From: Jie Wang wangjie125@huawei.com
mainline inclusion from mainline-v5.16-rc3 commit 82229c4dbb8a category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4M1HB CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
----------------------------------------------------------------------
Currently, HNS3 driver doesn't clear the reset flags of components after successfully executing reset, it causes userspace info of "Components reset" and "Components not reset" is incorrect.
So fix this problem by clear corresponding reset flag after reset process.
Fixes: ddccc5e368a3 ("net: hns3: add support for triggering reset by ethtool") Signed-off-by: Jie Wang wangjie125@huawei.com Signed-off-by: Guangbin Huang huangguangbin2@huawei.com Signed-off-by: Jakub Kicinski kuba@kernel.org Reviewed-by: Yongxin Li liyongxin1@huawei.com Signed-off-by: Junxin Chen chenjunxin1@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c | 4 ++++ 1 file changed, 4 insertions(+)
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c b/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c index 6b49ecb4a6f3..c06c39ece80d 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c @@ -990,6 +990,7 @@ static int hns3_set_reset(struct net_device *netdev, u32 *flags) struct hnae3_ae_dev *ae_dev = pci_get_drvdata(h->pdev); const struct hnae3_ae_ops *ops = h->ae_algo->ops; const struct hns3_reset_type_map *rst_type_map; + enum ethtool_reset_flags rst_flags; u32 i, size;
if (ops->ae_dev_resetting && ops->ae_dev_resetting(h)) @@ -1009,6 +1010,7 @@ static int hns3_set_reset(struct net_device *netdev, u32 *flags) for (i = 0; i < size; i++) { if (rst_type_map[i].rst_flags == *flags) { rst_type = rst_type_map[i].rst_type; + rst_flags = rst_type_map[i].rst_flags; break; } } @@ -1024,6 +1026,8 @@ static int hns3_set_reset(struct net_device *netdev, u32 *flags)
ops->reset_event(h->pdev, h);
+ *flags &= ~rst_flags; + return 0; }
From: Hao Chen chenhao288@hisilicon.com
mainline inclusion from mainline-master commit ed618bd80947 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4M1HB CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
----------------------------------------------------------------------
Add macro definition for number of IANA VXLAN-GPE port for generic use.
Signed-off-by: Hao Chen chenhao288@hisilicon.com Signed-off-by: Guangbin Huang huangguangbin2@huawei.com Signed-off-by: David S. Miller davem@davemloft.net Reviewed-by: Yongxin Li liyongxin1@huawei.com Signed-off-by: Junxin Chen chenjunxin1@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- drivers/net/vxlan.c | 2 +- include/net/vxlan.h | 1 + 2 files changed, 2 insertions(+), 1 deletion(-)
diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index 48fbdce6a70e..c362a6ac94c4 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -3743,7 +3743,7 @@ static int vxlan_config_validate(struct net *src_net, struct vxlan_config *conf,
if (!conf->dst_port) { if (conf->flags & VXLAN_F_GPE) - conf->dst_port = htons(4790); /* IANA VXLAN-GPE port */ + conf->dst_port = htons(IANA_VXLAN_GPE_UDP_PORT); else conf->dst_port = htons(vxlan_port); } diff --git a/include/net/vxlan.h b/include/net/vxlan.h index 08537aa14f7c..5a934bebe630 100644 --- a/include/net/vxlan.h +++ b/include/net/vxlan.h @@ -10,6 +10,7 @@ #include <net/nexthop.h>
#define IANA_VXLAN_UDP_PORT 4789 +#define IANA_VXLAN_GPE_UDP_PORT 4790
/* VXLAN protocol (RFC 7348) header: * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
From: Hao Chen chenhao288@hisilicon.com
mainline inclusion from mainline-master commit e54b708c5441 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4M1HB CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
----------------------------------------------------------------------
This patch uses macro IANA_VXLAN_GPE_UDP_PORT to replace number 4790 for cleanup.
Signed-off-by: Hao Chen chenhao288@hisilicon.com Signed-off-by: Guangbin Huang huangguangbin2@huawei.com Signed-off-by: David S. Miller davem@davemloft.net Reviewed-by: Yongxin Li liyongxin1@huawei.com Signed-off-by: Junxin Chen chenjunxin1@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- drivers/net/ethernet/hisilicon/hns3/hns3_enet.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c index 447e9719b528..6920f59e2283 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c @@ -1301,7 +1301,7 @@ static bool hns3_tunnel_csum_bug(struct sk_buff *skb) if (!(!skb->encapsulation && (l4.udp->dest == htons(IANA_VXLAN_UDP_PORT) || l4.udp->dest == htons(GENEVE_UDP_PORT) || - l4.udp->dest == htons(4790)))) + l4.udp->dest == htons(IANA_VXLAN_GPE_UDP_PORT)))) return false;
return true;
From: Jiaran Zhang zhangjiaran@huawei.com
mainline inclusion from mainline-master commit ed0e658c51aa category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4M1HB CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
----------------------------------------------------------------------
Currently, the hclge_reset_prepare_general function uses the goto statement to jump upwards, which increases code complexity and makes the program structure difficult to understand. In addition, if reset_pending is set, retry_cnt cannot be increased. This may result in a failure to exit the retry or increase the number of retries.
Use the while statement instead to make the program easier to understand and solve the problem that the goto statement cannot be exited.
Signed-off-by: Jiaran Zhang zhangjiaran@huawei.com Signed-off-by: Guangbin Huang huangguangbin2@huawei.com Signed-off-by: David S. Miller davem@davemloft.net Reviewed-by: Yongxin Li liyongxin1@huawei.com Signed-off-by: Junxin Chen chenjunxin1@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- .../hisilicon/hns3/hns3pf/hclge_main.c | 32 ++++++++----------- .../hisilicon/hns3/hns3vf/hclgevf_main.c | 32 ++++++++----------- 2 files changed, 28 insertions(+), 36 deletions(-)
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c index ff5baeef5204..167391187ade 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c @@ -11589,24 +11589,20 @@ static void hclge_reset_prepare_general(struct hnae3_ae_dev *ae_dev, int retry_cnt = 0; int ret;
-retry: - down(&hdev->reset_sem); - set_bit(HCLGE_STATE_RST_HANDLING, &hdev->state); - hdev->reset_type = rst_type; - ret = hclge_reset_prepare(hdev); - if (ret || hdev->reset_pending) { - dev_err(&hdev->pdev->dev, "fail to prepare to reset, ret=%d\n", - ret); - if (hdev->reset_pending || - retry_cnt++ < HCLGE_RESET_RETRY_CNT) { - dev_err(&hdev->pdev->dev, - "reset_pending:0x%lx, retry_cnt:%d\n", - hdev->reset_pending, retry_cnt); - clear_bit(HCLGE_STATE_RST_HANDLING, &hdev->state); - up(&hdev->reset_sem); - msleep(HCLGE_RESET_RETRY_WAIT_MS); - goto retry; - } + while (retry_cnt++ < HCLGE_RESET_RETRY_CNT) { + down(&hdev->reset_sem); + set_bit(HCLGE_STATE_RST_HANDLING, &hdev->state); + hdev->reset_type = rst_type; + ret = hclge_reset_prepare(hdev); + if (!ret && !hdev->reset_pending) + break; + + dev_err(&hdev->pdev->dev, + "failed to prepare to reset, ret=%d, reset_pending:0x%lx, retry_cnt:%d\n", + ret, hdev->reset_pending, retry_cnt); + clear_bit(HCLGE_STATE_RST_HANDLING, &hdev->state); + up(&hdev->reset_sem); + msleep(HCLGE_RESET_RETRY_WAIT_MS); }
/* disable misc vector before reset done */ diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c index d7a7c8240bb2..54ac7f21a9c6 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c @@ -2166,24 +2166,20 @@ static void hclgevf_reset_prepare_general(struct hnae3_ae_dev *ae_dev, int retry_cnt = 0; int ret;
-retry: - down(&hdev->reset_sem); - set_bit(HCLGEVF_STATE_RST_HANDLING, &hdev->state); - hdev->reset_type = rst_type; - ret = hclgevf_reset_prepare(hdev); - if (ret) { - dev_err(&hdev->pdev->dev, "fail to prepare to reset, ret=%d\n", - ret); - if (hdev->reset_pending || - retry_cnt++ < HCLGEVF_RESET_RETRY_CNT) { - dev_err(&hdev->pdev->dev, - "reset_pending:0x%lx, retry_cnt:%d\n", - hdev->reset_pending, retry_cnt); - clear_bit(HCLGEVF_STATE_RST_HANDLING, &hdev->state); - up(&hdev->reset_sem); - msleep(HCLGEVF_RESET_RETRY_WAIT_MS); - goto retry; - } + while (retry_cnt++ < HCLGEVF_RESET_RETRY_CNT) { + down(&hdev->reset_sem); + set_bit(HCLGEVF_STATE_RST_HANDLING, &hdev->state); + hdev->reset_type = rst_type; + ret = hclgevf_reset_prepare(hdev); + if (!ret && !hdev->reset_pending) + break; + + dev_err(&hdev->pdev->dev, + "failed to prepare to reset, ret=%d, reset_pending:0x%lx, retry_cnt:%d\n", + ret, hdev->reset_pending, retry_cnt); + clear_bit(HCLGEVF_STATE_RST_HANDLING, &hdev->state); + up(&hdev->reset_sem); + msleep(HCLGEVF_RESET_RETRY_WAIT_MS); }
/* disable misc vector before reset done */
From: Hao Chen chenhao288@hisilicon.com
mainline inclusion from mainline-master commit e74a726da2c4 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4M1HB CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
----------------------------------------------------------------------
Split rx copybreak handle into a separate function from function hns3_nic_reuse_page() to improve code simplicity.
Signed-off-by: Hao Chen chenhao288@hisilicon.com Signed-off-by: Guangbin Huang huangguangbin2@huawei.com Signed-off-by: David S. Miller davem@davemloft.net Reviewed-by: Yongxin Li liyongxin1@huawei.com Signed-off-by: Junxin Chen chenjunxin1@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- .../net/ethernet/hisilicon/hns3/hns3_enet.c | 55 ++++++++++++------- 1 file changed, 35 insertions(+), 20 deletions(-)
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c index 6920f59e2283..6619bf4baa58 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c @@ -3545,6 +3545,38 @@ static bool hns3_can_reuse_page(struct hns3_desc_cb *cb) return page_count(cb->priv) == cb->pagecnt_bias; }
+static int hns3_handle_rx_copybreak(struct sk_buff *skb, int i, + struct hns3_enet_ring *ring, + int pull_len, + struct hns3_desc_cb *desc_cb) +{ + struct hns3_desc *desc = &ring->desc[ring->next_to_clean]; + u32 frag_offset = desc_cb->page_offset + pull_len; + int size = le16_to_cpu(desc->rx.size); + u32 frag_size = size - pull_len; + void *frag = napi_alloc_frag(frag_size); + + if (unlikely(!frag)) { + u64_stats_update_begin(&ring->syncp); + ring->stats.frag_alloc_err++; + u64_stats_update_end(&ring->syncp); + + hns3_rl_err(ring_to_netdev(ring), + "failed to allocate rx frag\n"); + return -ENOMEM; + } + + desc_cb->reuse_flag = 1; + memcpy(frag, desc_cb->buf + frag_offset, frag_size); + skb_add_rx_frag(skb, i, virt_to_page(frag), + offset_in_page(frag), frag_size, frag_size); + + u64_stats_update_begin(&ring->syncp); + ring->stats.frag_alloc++; + u64_stats_update_end(&ring->syncp); + return 0; +} + static void hns3_nic_reuse_page(struct sk_buff *skb, int i, struct hns3_enet_ring *ring, int pull_len, struct hns3_desc_cb *desc_cb) @@ -3554,6 +3586,7 @@ static void hns3_nic_reuse_page(struct sk_buff *skb, int i, int size = le16_to_cpu(desc->rx.size); u32 truesize = hns3_buf_size(ring); u32 frag_size = size - pull_len; + int ret = 0; bool reused;
if (ring->page_pool) { @@ -3588,27 +3621,9 @@ static void hns3_nic_reuse_page(struct sk_buff *skb, int i, desc_cb->page_offset = 0; desc_cb->reuse_flag = 1; } else if (frag_size <= ring->rx_copybreak) { - void *frag = napi_alloc_frag(frag_size); - - if (unlikely(!frag)) { - u64_stats_update_begin(&ring->syncp); - ring->stats.frag_alloc_err++; - u64_stats_update_end(&ring->syncp); - - hns3_rl_err(ring_to_netdev(ring), - "failed to allocate rx frag\n"); + ret = hns3_handle_rx_copybreak(skb, i, ring, pull_len, desc_cb); + if (ret) goto out; - } - - desc_cb->reuse_flag = 1; - memcpy(frag, desc_cb->buf + frag_offset, frag_size); - skb_add_rx_frag(skb, i, virt_to_page(frag), - offset_in_page(frag), frag_size, frag_size); - - u64_stats_update_begin(&ring->syncp); - ring->stats.frag_alloc++; - u64_stats_update_end(&ring->syncp); - return; }
out:
From: Jie Wang wangjie125@huawei.com
mainline inclusion from mainline-master commit e6fe5e167185 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4M1HB CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
----------------------------------------------------------------------
Use for statement to optimize some print work of function hclge_dbg_dump_rst_info() and hclge_dbg_dump_mac_enable_status() to improve code simplicity.
Signed-off-by: Jie Wang wangjie125@huawei.com Signed-off-by: Guangbin Huang huangguangbin2@huawei.com Signed-off-by: David S. Miller davem@davemloft.net Reviewed-by: Yongxin Li liyongxin1@huawei.com Signed-off-by: Junxin Chen chenjunxin1@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- .../hisilicon/hns3/hns3pf/hclge_debugfs.c | 93 +++++++++---------- .../hisilicon/hns3/hns3pf/hclge_debugfs.h | 5 + 2 files changed, 48 insertions(+), 50 deletions(-)
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_debugfs.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_debugfs.c index 4e0a8c2f7c05..65168125c42e 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_debugfs.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_debugfs.c @@ -258,12 +258,29 @@ hclge_dbg_dump_reg_common(struct hclge_dev *hdev, return 0; }
+static const struct hclge_dbg_status_dfx_info hclge_dbg_mac_en_status[] = { + {HCLGE_MAC_TX_EN_B, "mac_trans_en"}, + {HCLGE_MAC_RX_EN_B, "mac_rcv_en"}, + {HCLGE_MAC_PAD_TX_B, "pad_trans_en"}, + {HCLGE_MAC_PAD_RX_B, "pad_rcv_en"}, + {HCLGE_MAC_1588_TX_B, "1588_trans_en"}, + {HCLGE_MAC_1588_RX_B, "1588_rcv_en"}, + {HCLGE_MAC_APP_LP_B, "mac_app_loop_en"}, + {HCLGE_MAC_LINE_LP_B, "mac_line_loop_en"}, + {HCLGE_MAC_FCS_TX_B, "mac_fcs_tx_en"}, + {HCLGE_MAC_RX_OVERSIZE_TRUNCATE_B, "mac_rx_oversize_truncate_en"}, + {HCLGE_MAC_RX_FCS_STRIP_B, "mac_rx_fcs_strip_en"}, + {HCLGE_MAC_RX_FCS_B, "mac_rx_fcs_en"}, + {HCLGE_MAC_TX_UNDER_MIN_ERR_B, "mac_tx_under_min_err_en"}, + {HCLGE_MAC_TX_OVERSIZE_TRUNCATE_B, "mac_tx_oversize_truncate_en"} +}; + static int hclge_dbg_dump_mac_enable_status(struct hclge_dev *hdev, char *buf, int len, int *pos) { struct hclge_config_mac_mode_cmd *req; struct hclge_desc desc; - u32 loop_en; + u32 loop_en, i, offset; int ret;
hclge_cmd_setup_basic_desc(&desc, HCLGE_OPC_CONFIG_MAC_MODE, true); @@ -278,39 +295,12 @@ static int hclge_dbg_dump_mac_enable_status(struct hclge_dev *hdev, char *buf, req = (struct hclge_config_mac_mode_cmd *)desc.data; loop_en = le32_to_cpu(req->txrx_pad_fcs_loop_en);
- *pos += scnprintf(buf + *pos, len - *pos, "mac_trans_en: %#x\n", - hnae3_get_bit(loop_en, HCLGE_MAC_TX_EN_B)); - *pos += scnprintf(buf + *pos, len - *pos, "mac_rcv_en: %#x\n", - hnae3_get_bit(loop_en, HCLGE_MAC_RX_EN_B)); - *pos += scnprintf(buf + *pos, len - *pos, "pad_trans_en: %#x\n", - hnae3_get_bit(loop_en, HCLGE_MAC_PAD_TX_B)); - *pos += scnprintf(buf + *pos, len - *pos, "pad_rcv_en: %#x\n", - hnae3_get_bit(loop_en, HCLGE_MAC_PAD_RX_B)); - *pos += scnprintf(buf + *pos, len - *pos, "1588_trans_en: %#x\n", - hnae3_get_bit(loop_en, HCLGE_MAC_1588_TX_B)); - *pos += scnprintf(buf + *pos, len - *pos, "1588_rcv_en: %#x\n", - hnae3_get_bit(loop_en, HCLGE_MAC_1588_RX_B)); - *pos += scnprintf(buf + *pos, len - *pos, "mac_app_loop_en: %#x\n", - hnae3_get_bit(loop_en, HCLGE_MAC_APP_LP_B)); - *pos += scnprintf(buf + *pos, len - *pos, "mac_line_loop_en: %#x\n", - hnae3_get_bit(loop_en, HCLGE_MAC_LINE_LP_B)); - *pos += scnprintf(buf + *pos, len - *pos, "mac_fcs_tx_en: %#x\n", - hnae3_get_bit(loop_en, HCLGE_MAC_FCS_TX_B)); - *pos += scnprintf(buf + *pos, len - *pos, - "mac_rx_oversize_truncate_en: %#x\n", - hnae3_get_bit(loop_en, - HCLGE_MAC_RX_OVERSIZE_TRUNCATE_B)); - *pos += scnprintf(buf + *pos, len - *pos, "mac_rx_fcs_strip_en: %#x\n", - hnae3_get_bit(loop_en, HCLGE_MAC_RX_FCS_STRIP_B)); - *pos += scnprintf(buf + *pos, len - *pos, "mac_rx_fcs_en: %#x\n", - hnae3_get_bit(loop_en, HCLGE_MAC_RX_FCS_B)); - *pos += scnprintf(buf + *pos, len - *pos, - "mac_tx_under_min_err_en: %#x\n", - hnae3_get_bit(loop_en, HCLGE_MAC_TX_UNDER_MIN_ERR_B)); - *pos += scnprintf(buf + *pos, len - *pos, - "mac_tx_oversize_truncate_en: %#x\n", - hnae3_get_bit(loop_en, - HCLGE_MAC_TX_OVERSIZE_TRUNCATE_B)); + for (i = 0; i < ARRAY_SIZE(hclge_dbg_mac_en_status); i++) { + offset = hclge_dbg_mac_en_status[i].offset; + *pos += scnprintf(buf + *pos, len - *pos, "%s: %#x\n", + hclge_dbg_mac_en_status[i].message, + hnae3_get_bit(loop_en, offset)); + }
return 0; } @@ -1614,8 +1604,19 @@ static int hclge_dbg_dump_fd_counter(struct hclge_dev *hdev, char *buf, int len) return 0; }
+static const struct hclge_dbg_status_dfx_info hclge_dbg_rst_info[] = { + {HCLGE_MISC_VECTOR_REG_BASE, "vector0 interrupt enable status"}, + {HCLGE_MISC_RESET_STS_REG, "reset interrupt source"}, + {HCLGE_MISC_VECTOR_INT_STS, "reset interrupt status"}, + {HCLGE_RAS_PF_OTHER_INT_STS_REG, "RAS interrupt status"}, + {HCLGE_GLOBAL_RESET_REG, "hardware reset status"}, + {HCLGE_NIC_CSQ_DEPTH_REG, "handshake status"}, + {HCLGE_FUN_RST_ING, "function reset status"} +}; + int hclge_dbg_dump_rst_info(struct hclge_dev *hdev, char *buf, int len) { + u32 i, offset; int pos = 0;
pos += scnprintf(buf + pos, len - pos, "PF reset count: %u\n", @@ -1634,22 +1635,14 @@ int hclge_dbg_dump_rst_info(struct hclge_dev *hdev, char *buf, int len) hdev->rst_stats.reset_cnt); pos += scnprintf(buf + pos, len - pos, "reset fail count: %u\n", hdev->rst_stats.reset_fail_cnt); - pos += scnprintf(buf + pos, len - pos, - "vector0 interrupt enable status: 0x%x\n", - hclge_read_dev(&hdev->hw, HCLGE_MISC_VECTOR_REG_BASE)); - pos += scnprintf(buf + pos, len - pos, "reset interrupt source: 0x%x\n", - hclge_read_dev(&hdev->hw, HCLGE_MISC_RESET_STS_REG)); - pos += scnprintf(buf + pos, len - pos, "reset interrupt status: 0x%x\n", - hclge_read_dev(&hdev->hw, HCLGE_MISC_VECTOR_INT_STS)); - pos += scnprintf(buf + pos, len - pos, "RAS interrupt status: 0x%x\n", - hclge_read_dev(&hdev->hw, - HCLGE_RAS_PF_OTHER_INT_STS_REG)); - pos += scnprintf(buf + pos, len - pos, "hardware reset status: 0x%x\n", - hclge_read_dev(&hdev->hw, HCLGE_GLOBAL_RESET_REG)); - pos += scnprintf(buf + pos, len - pos, "handshake status: 0x%x\n", - hclge_read_dev(&hdev->hw, HCLGE_NIC_CSQ_DEPTH_REG)); - pos += scnprintf(buf + pos, len - pos, "function reset status: 0x%x\n", - hclge_read_dev(&hdev->hw, HCLGE_FUN_RST_ING)); + + for (i = 0; i < ARRAY_SIZE(hclge_dbg_rst_info); i++) { + offset = hclge_dbg_rst_info[i].offset; + pos += scnprintf(buf + pos, len - pos, "%s: 0x%x\n", + hclge_dbg_rst_info[i].message, + hclge_read_dev(&hdev->hw, offset)); + } + pos += scnprintf(buf + pos, len - pos, "hdev state: 0x%lx\n", hdev->state);
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_debugfs.h b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_debugfs.h index 5b6018204f92..724052928b88 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_debugfs.h +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_debugfs.h @@ -94,6 +94,11 @@ struct hclge_dbg_func { char *buf, int len); };
+struct hclge_dbg_status_dfx_info { + u32 offset; + char message[HCLGE_DBG_MAX_DFX_MSG_LEN]; +}; + static const struct hclge_dbg_dfx_message hclge_dbg_bios_common_reg[] = { {false, "Reserved"}, {true, "BP_CPU_STATE"},
From: Yufeng Mo moyufeng@huawei.com
mainline inclusion from mainline-master commit a4ae2bc0abd4 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4M1HB CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
----------------------------------------------------------------------
Function hns3_get_tx_timeo_queue_info() is a bit too long. So add two new functions hns3_dump_queue_stats() and hns3_dump_queue_reg() to simplify code and improve code readability.
Signed-off-by: Yufeng Mo moyufeng@huawei.com Signed-off-by: Guangbin Huang huangguangbin2@huawei.com Signed-off-by: David S. Miller davem@davemloft.net Reviewed-by: Yongxin Li liyongxin1@huawei.com Signed-off-by: Junxin Chen chenjunxin1@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- .../net/ethernet/hisilicon/hns3/hns3_enet.c | 100 +++++++++--------- .../net/ethernet/hisilicon/hns3/hns3_enet.h | 5 + 2 files changed, 57 insertions(+), 48 deletions(-)
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c index 6619bf4baa58..2151c206de9e 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c @@ -2658,18 +2658,8 @@ static int hns3_nic_change_mtu(struct net_device *netdev, int new_mtu) return ret; }
-static bool hns3_get_tx_timeo_queue_info(struct net_device *ndev) +static int hns3_get_timeout_queue(struct net_device *ndev) { - struct hns3_nic_priv *priv = netdev_priv(ndev); - struct hnae3_handle *h = hns3_get_handle(ndev); - struct hns3_enet_ring *tx_ring; - struct napi_struct *napi; - int timeout_queue = 0; - int hw_head, hw_tail; - int fbd_num, fbd_oft; - int ebd_num, ebd_oft; - int bd_num, bd_err; - int ring_en, tc; int i;
/* Find the stopped queue the same way the stack does */ @@ -2689,7 +2679,6 @@ static bool hns3_get_tx_timeo_queue_info(struct net_device *ndev) dql->last_obj_cnt, dql->num_queued, dql->adj_limit, dql->num_completed); #endif - timeout_queue = i; netdev_info(ndev, "queue state: 0x%lx, delta msecs: %u\n", q->state, jiffies_to_msecs(jiffies - trans_start)); @@ -2697,17 +2686,15 @@ static bool hns3_get_tx_timeo_queue_info(struct net_device *ndev) } }
- if (i == ndev->num_tx_queues) { - netdev_info(ndev, - "no netdev TX timeout queue found, timeout count: %llu\n", - priv->tx_timeout_count); - return false; - } - - priv->tx_timeout_count++; + return i; +}
- tx_ring = &priv->ring[timeout_queue]; - napi = &tx_ring->tqp_vector->napi; +static void hns3_dump_queue_stats(struct net_device *ndev, + struct hns3_enet_ring *tx_ring, + int timeout_queue) +{ + struct napi_struct *napi = &tx_ring->tqp_vector->napi; + struct hns3_nic_priv *priv = netdev_priv(ndev);
netdev_info(ndev, "tx_timeout count: %llu, queue id: %d, SW_NTU: 0x%x, SW_NTC: 0x%x, napi state: %lu\n", @@ -2723,6 +2710,48 @@ static bool hns3_get_tx_timeo_queue_info(struct net_device *ndev) "seg_pkt_cnt: %llu, tx_more: %llu, restart_queue: %llu, tx_busy: %llu\n", tx_ring->stats.seg_pkt_cnt, tx_ring->stats.tx_more, tx_ring->stats.restart_queue, tx_ring->stats.tx_busy); +} + +static void hns3_dump_queue_reg(struct net_device *ndev, + struct hns3_enet_ring *tx_ring) +{ + netdev_info(ndev, + "BD_NUM: 0x%x HW_HEAD: 0x%x, HW_TAIL: 0x%x, BD_ERR: 0x%x, INT: 0x%x\n", + hns3_tqp_read_reg(tx_ring, HNS3_RING_TX_RING_BD_NUM_REG), + hns3_tqp_read_reg(tx_ring, HNS3_RING_TX_RING_HEAD_REG), + hns3_tqp_read_reg(tx_ring, HNS3_RING_TX_RING_TAIL_REG), + hns3_tqp_read_reg(tx_ring, HNS3_RING_TX_RING_BD_ERR_REG), + readl(tx_ring->tqp_vector->mask_addr)); + netdev_info(ndev, + "RING_EN: 0x%x, TC: 0x%x, FBD_NUM: 0x%x FBD_OFT: 0x%x, EBD_NUM: 0x%x, EBD_OFT: 0x%x\n", + hns3_tqp_read_reg(tx_ring, HNS3_RING_EN_REG), + hns3_tqp_read_reg(tx_ring, HNS3_RING_TX_RING_TC_REG), + hns3_tqp_read_reg(tx_ring, HNS3_RING_TX_RING_FBDNUM_REG), + hns3_tqp_read_reg(tx_ring, HNS3_RING_TX_RING_OFFSET_REG), + hns3_tqp_read_reg(tx_ring, HNS3_RING_TX_RING_EBDNUM_REG), + hns3_tqp_read_reg(tx_ring, + HNS3_RING_TX_RING_EBD_OFFSET_REG)); +} + +static bool hns3_get_tx_timeo_queue_info(struct net_device *ndev) +{ + struct hns3_nic_priv *priv = netdev_priv(ndev); + struct hnae3_handle *h = hns3_get_handle(ndev); + struct hns3_enet_ring *tx_ring; + int timeout_queue; + + timeout_queue = hns3_get_timeout_queue(ndev); + if (timeout_queue >= ndev->num_tx_queues) { + netdev_info(ndev, + "no netdev TX timeout queue found, timeout count: %llu\n", + priv->tx_timeout_count); + return false; + } + + priv->tx_timeout_count++; + + tx_ring = &priv->ring[timeout_queue]; + hns3_dump_queue_stats(ndev, tx_ring, timeout_queue);
/* When mac received many pause frames continuous, it's unable to send * packets, which may cause tx timeout @@ -2735,32 +2764,7 @@ static bool hns3_get_tx_timeo_queue_info(struct net_device *ndev) mac_stats.tx_pause_cnt, mac_stats.rx_pause_cnt); }
- hw_head = readl_relaxed(tx_ring->tqp->io_base + - HNS3_RING_TX_RING_HEAD_REG); - hw_tail = readl_relaxed(tx_ring->tqp->io_base + - HNS3_RING_TX_RING_TAIL_REG); - fbd_num = readl_relaxed(tx_ring->tqp->io_base + - HNS3_RING_TX_RING_FBDNUM_REG); - fbd_oft = readl_relaxed(tx_ring->tqp->io_base + - HNS3_RING_TX_RING_OFFSET_REG); - ebd_num = readl_relaxed(tx_ring->tqp->io_base + - HNS3_RING_TX_RING_EBDNUM_REG); - ebd_oft = readl_relaxed(tx_ring->tqp->io_base + - HNS3_RING_TX_RING_EBD_OFFSET_REG); - bd_num = readl_relaxed(tx_ring->tqp->io_base + - HNS3_RING_TX_RING_BD_NUM_REG); - bd_err = readl_relaxed(tx_ring->tqp->io_base + - HNS3_RING_TX_RING_BD_ERR_REG); - ring_en = readl_relaxed(tx_ring->tqp->io_base + HNS3_RING_EN_REG); - tc = readl_relaxed(tx_ring->tqp->io_base + HNS3_RING_TX_RING_TC_REG); - - netdev_info(ndev, - "BD_NUM: 0x%x HW_HEAD: 0x%x, HW_TAIL: 0x%x, BD_ERR: 0x%x, INT: 0x%x\n", - bd_num, hw_head, hw_tail, bd_err, - readl(tx_ring->tqp_vector->mask_addr)); - netdev_info(ndev, - "RING_EN: 0x%x, TC: 0x%x, FBD_NUM: 0x%x FBD_OFT: 0x%x, EBD_NUM: 0x%x, EBD_OFT: 0x%x\n", - ring_en, tc, fbd_num, fbd_oft, ebd_num, ebd_oft); + hns3_dump_queue_reg(ndev, tx_ring);
return true; } diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.h b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.h index 361a6390e159..808405cc0280 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.h +++ b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.h @@ -621,6 +621,11 @@ static inline int ring_space(struct hns3_enet_ring *ring) (begin - end)) - 1; }
+static inline u32 hns3_tqp_read_reg(struct hns3_enet_ring *ring, u32 reg) +{ + return readl_relaxed(ring->tqp->io_base + reg); +} + static inline u32 hns3_read_reg(void __iomem *base, u32 reg) { return readl(base + reg);
From: Guangbin Huang huangguangbin2@huawei.com
mainline inclusion from mainline-master commit e46da6a3d4d3 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4M1HB CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
----------------------------------------------------------------------
To reuse the code of converting speed of driver to speed of firmware in function hclge_cfg_mac_speed_dup_hw(), encapsulate them into a new function hclge_convert_to_fw_speed().
Signed-off-by: Guangbin Huang huangguangbin2@huawei.com Signed-off-by: David S. Miller davem@davemloft.net Reviewed-by: Yongxin Li liyongxin1@huawei.com Signed-off-by: Junxin Chen chenjunxin1@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- .../hisilicon/hns3/hns3pf/hclge_main.c | 71 +++++++++---------- .../hisilicon/hns3/hns3pf/hclge_main.h | 5 ++ 2 files changed, 37 insertions(+), 39 deletions(-)
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c index 167391187ade..753e6c2fff80 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c @@ -2653,11 +2653,38 @@ static u8 hclge_check_speed_dup(u8 duplex, int speed) return duplex; }
+struct hclge_mac_speed_map hclge_mac_speed_map_to_fw[] = { + {HCLGE_MAC_SPEED_10M, HCLGE_FW_MAC_SPEED_10M}, + {HCLGE_MAC_SPEED_100M, HCLGE_FW_MAC_SPEED_100M}, + {HCLGE_MAC_SPEED_1G, HCLGE_FW_MAC_SPEED_1G}, + {HCLGE_MAC_SPEED_10G, HCLGE_FW_MAC_SPEED_10G}, + {HCLGE_MAC_SPEED_25G, HCLGE_FW_MAC_SPEED_25G}, + {HCLGE_MAC_SPEED_40G, HCLGE_FW_MAC_SPEED_40G}, + {HCLGE_MAC_SPEED_50G, HCLGE_FW_MAC_SPEED_50G}, + {HCLGE_MAC_SPEED_100G, HCLGE_FW_MAC_SPEED_100G}, + {HCLGE_MAC_SPEED_200G, HCLGE_FW_MAC_SPEED_200G}, +}; + +static int hclge_convert_to_fw_speed(u32 speed_drv, u32 *speed_fw) +{ + u16 i; + + for (i = 0; i < ARRAY_SIZE(hclge_mac_speed_map_to_fw); i++) { + if (hclge_mac_speed_map_to_fw[i].speed_drv == speed_drv) { + *speed_fw = hclge_mac_speed_map_to_fw[i].speed_fw; + return 0; + } + } + + return -EINVAL; +} + static int hclge_cfg_mac_speed_dup_hw(struct hclge_dev *hdev, int speed, u8 duplex) { struct hclge_config_mac_speed_dup_cmd *req; struct hclge_desc desc; + u32 speed_fw; int ret;
req = (struct hclge_config_mac_speed_dup_cmd *)desc.data; @@ -2667,48 +2694,14 @@ static int hclge_cfg_mac_speed_dup_hw(struct hclge_dev *hdev, int speed, if (duplex) hnae3_set_bit(req->speed_dup, HCLGE_CFG_DUPLEX_B, 1);
- switch (speed) { - case HCLGE_MAC_SPEED_10M: - hnae3_set_field(req->speed_dup, HCLGE_CFG_SPEED_M, - HCLGE_CFG_SPEED_S, HCLGE_FW_MAC_SPEED_10M); - break; - case HCLGE_MAC_SPEED_100M: - hnae3_set_field(req->speed_dup, HCLGE_CFG_SPEED_M, - HCLGE_CFG_SPEED_S, HCLGE_FW_MAC_SPEED_100M); - break; - case HCLGE_MAC_SPEED_1G: - hnae3_set_field(req->speed_dup, HCLGE_CFG_SPEED_M, - HCLGE_CFG_SPEED_S, HCLGE_FW_MAC_SPEED_1G); - break; - case HCLGE_MAC_SPEED_10G: - hnae3_set_field(req->speed_dup, HCLGE_CFG_SPEED_M, - HCLGE_CFG_SPEED_S, HCLGE_FW_MAC_SPEED_10G); - break; - case HCLGE_MAC_SPEED_25G: - hnae3_set_field(req->speed_dup, HCLGE_CFG_SPEED_M, - HCLGE_CFG_SPEED_S, HCLGE_FW_MAC_SPEED_25G); - break; - case HCLGE_MAC_SPEED_40G: - hnae3_set_field(req->speed_dup, HCLGE_CFG_SPEED_M, - HCLGE_CFG_SPEED_S, HCLGE_FW_MAC_SPEED_40G); - break; - case HCLGE_MAC_SPEED_50G: - hnae3_set_field(req->speed_dup, HCLGE_CFG_SPEED_M, - HCLGE_CFG_SPEED_S, HCLGE_FW_MAC_SPEED_50G); - break; - case HCLGE_MAC_SPEED_100G: - hnae3_set_field(req->speed_dup, HCLGE_CFG_SPEED_M, - HCLGE_CFG_SPEED_S, HCLGE_FW_MAC_SPEED_100G); - break; - case HCLGE_MAC_SPEED_200G: - hnae3_set_field(req->speed_dup, HCLGE_CFG_SPEED_M, - HCLGE_CFG_SPEED_S, HCLGE_FW_MAC_SPEED_200G); - break; - default: + ret = hclge_convert_to_fw_speed(speed, &speed_fw); + if (ret) { dev_err(&hdev->pdev->dev, "invalid speed (%d)\n", speed); - return -EINVAL; + return ret; }
+ hnae3_set_field(req->speed_dup, HCLGE_CFG_SPEED_M, HCLGE_CFG_SPEED_S, + speed_fw); hnae3_set_bit(req->mac_change_fec_en, HCLGE_CFG_MAC_SPEED_CHANGE_EN_B, 1);
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h index 42ce1eee33c4..a51418fdbb24 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h @@ -1095,6 +1095,11 @@ struct hclge_speed_bit_map { u32 speed_bit; };
+struct hclge_mac_speed_map { + u32 speed_drv; /* speed defined in driver */ + u32 speed_fw; /* speed defined in firmware */ +}; + int hclge_set_vport_promisc_mode(struct hclge_vport *vport, bool en_uc_pmc, bool en_mc_pmc, bool en_bc_pmc); int hclge_add_uc_addr_common(struct hclge_vport *vport,
From: Guangbin Huang huangguangbin2@huawei.com
mainline inclusion from mainline-master commit 7ca561be11d0 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4M1HB CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
----------------------------------------------------------------------
This patch encapsulates the process code of tc based schedule mode of function hclge_tm_lvl34_schd_mode_cfg() into a new function hclge_tm_schd_mode_tc_base_cfg(). It make code more concise and the new process code can be reused.
Signed-off-by: Guangbin Huang huangguangbin2@huawei.com Signed-off-by: David S. Miller davem@davemloft.net Reviewed-by: Yongxin Li liyongxin1@huawei.com Signed-off-by: Junxin Chen chenjunxin1@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- .../ethernet/hisilicon/hns3/hns3pf/hclge_tm.c | 33 +++++++++++++------ 1 file changed, 23 insertions(+), 10 deletions(-)
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.c index 429652a8cde1..1afd305ebd36 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.c @@ -1274,6 +1274,27 @@ static int hclge_tm_lvl2_schd_mode_cfg(struct hclge_dev *hdev) return 0; }
+static int hclge_tm_schd_mode_tc_base_cfg(struct hclge_dev *hdev, u8 pri_id) +{ + struct hclge_vport *vport = hdev->vport; + int ret; + u16 i; + + ret = hclge_tm_pri_schd_mode_cfg(hdev, pri_id); + if (ret) + return ret; + + for (i = 0; i < hdev->num_alloc_vport; i++) { + ret = hclge_tm_qs_schd_mode_cfg(hdev, + vport[i].qs_offset + pri_id, + HCLGE_SCH_MODE_DWRR); + if (ret) + return ret; + } + + return 0; +} + static int hclge_tm_schd_mode_vnet_base_cfg(struct hclge_vport *vport) { struct hnae3_knic_private_info *kinfo = &vport->nic.kinfo; @@ -1304,21 +1325,13 @@ static int hclge_tm_lvl34_schd_mode_cfg(struct hclge_dev *hdev) { struct hclge_vport *vport = hdev->vport; int ret; - u8 i, k; + u8 i;
if (hdev->tx_sch_mode == HCLGE_FLAG_TC_BASE_SCH_MODE) { for (i = 0; i < hdev->tm_info.num_tc; i++) { - ret = hclge_tm_pri_schd_mode_cfg(hdev, i); + ret = hclge_tm_schd_mode_tc_base_cfg(hdev, i); if (ret) return ret; - - for (k = 0; k < hdev->num_alloc_vport; k++) { - ret = hclge_tm_qs_schd_mode_cfg( - hdev, vport[k].qs_offset + i, - HCLGE_SCH_MODE_DWRR); - if (ret) - return ret; - } } } else { for (i = 0; i < hdev->num_alloc_vport; i++) {
From: Guangbin Huang huangguangbin2@huawei.com
mainline inclusion from mainline-master commit e06dac5290b7 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4M1HB CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
----------------------------------------------------------------------
This patch encapsulates the process code for queue to qset config of two mode(tc based and vnet based) into two function, for making code more concise.
Signed-off-by: Guangbin Huang huangguangbin2@huawei.com Signed-off-by: David S. Miller davem@davemloft.net Reviewed-by: Yongxin Li liyongxin1@huawei.com Signed-off-by: Junxin Chen chenjunxin1@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- .../ethernet/hisilicon/hns3/hns3pf/hclge_tm.c | 75 ++++++++++++------- 1 file changed, 50 insertions(+), 25 deletions(-)
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.c index 1afd305ebd36..3edbfc8d17e8 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.c @@ -916,38 +916,63 @@ static int hclge_vport_q_to_qs_map(struct hclge_dev *hdev, return 0; }
-static int hclge_tm_pri_q_qs_cfg(struct hclge_dev *hdev) +static int hclge_tm_pri_q_qs_cfg_tc_base(struct hclge_dev *hdev) { struct hclge_vport *vport = hdev->vport; + u16 i, k; int ret; - u32 i, k;
- if (hdev->tx_sch_mode == HCLGE_FLAG_TC_BASE_SCH_MODE) { - /* Cfg qs -> pri mapping, one by one mapping */ - for (k = 0; k < hdev->num_alloc_vport; k++) { - struct hnae3_knic_private_info *kinfo = - &vport[k].nic.kinfo; - - for (i = 0; i < kinfo->tc_info.num_tc; i++) { - ret = hclge_tm_qs_to_pri_map_cfg( - hdev, vport[k].qs_offset + i, i); - if (ret) - return ret; - } + /* Cfg qs -> pri mapping, one by one mapping */ + for (k = 0; k < hdev->num_alloc_vport; k++) { + struct hnae3_knic_private_info *kinfo = &vport[k].nic.kinfo; + + for (i = 0; i < kinfo->tc_info.num_tc; i++) { + ret = hclge_tm_qs_to_pri_map_cfg(hdev, + vport[k].qs_offset + i, + i); + if (ret) + return ret; } - } else if (hdev->tx_sch_mode == HCLGE_FLAG_VNET_BASE_SCH_MODE) { - /* Cfg qs -> pri mapping, qs = tc, pri = vf, 8 qs -> 1 pri */ - for (k = 0; k < hdev->num_alloc_vport; k++) - for (i = 0; i < HNAE3_MAX_TC; i++) { - ret = hclge_tm_qs_to_pri_map_cfg( - hdev, vport[k].qs_offset + i, k); - if (ret) - return ret; - } - } else { - return -EINVAL; }
+ return 0; +} + +static int hclge_tm_pri_q_qs_cfg_vnet_base(struct hclge_dev *hdev) +{ + struct hclge_vport *vport = hdev->vport; + u16 i, k; + int ret; + + /* Cfg qs -> pri mapping, qs = tc, pri = vf, 8 qs -> 1 pri */ + for (k = 0; k < hdev->num_alloc_vport; k++) + for (i = 0; i < HNAE3_MAX_TC; i++) { + ret = hclge_tm_qs_to_pri_map_cfg(hdev, + vport[k].qs_offset + i, + k); + if (ret) + return ret; + } + + return 0; +} + +static int hclge_tm_pri_q_qs_cfg(struct hclge_dev *hdev) +{ + struct hclge_vport *vport = hdev->vport; + int ret; + u32 i; + + if (hdev->tx_sch_mode == HCLGE_FLAG_TC_BASE_SCH_MODE) + ret = hclge_tm_pri_q_qs_cfg_tc_base(hdev); + else if (hdev->tx_sch_mode == HCLGE_FLAG_VNET_BASE_SCH_MODE) + ret = hclge_tm_pri_q_qs_cfg_vnet_base(hdev); + else + return -EINVAL; + + if (ret) + return ret; + /* Cfg q -> qs mapping */ for (i = 0; i < hdev->num_alloc_vport; i++) { ret = hclge_vport_q_to_qs_map(hdev, vport);
From: Yufeng Mo moyufeng@huawei.com
mainline inclusion from mainline-master commit 8469b645c9a1 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4M1HB CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
----------------------------------------------------------------------
Function hns3_nic_get_stats64() is a bit too long. So add a new function hns3_fetch_stats() to simplify code and improve code readability.
Signed-off-by: Yufeng Mo moyufeng@huawei.com Signed-off-by: Guangbin Huang huangguangbin2@huawei.com Signed-off-by: David S. Miller davem@davemloft.net Reviewed-by: Yongxin Li liyongxin1@huawei.com Signed-off-by: Junxin Chen chenjunxin1@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- .../net/ethernet/hisilicon/hns3/hns3_enet.c | 123 +++++++++--------- 1 file changed, 61 insertions(+), 62 deletions(-)
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c index 2151c206de9e..b64a4c29e78c 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c @@ -2382,90 +2382,89 @@ static netdev_features_t hns3_features_check(struct sk_buff *skb, return features; }
+static void hns3_fetch_stats(struct rtnl_link_stats64 *stats, + struct hns3_enet_ring *ring, bool is_tx) +{ + unsigned int start; + + do { + start = u64_stats_fetch_begin_irq(&ring->syncp); + if (is_tx) { + stats->tx_bytes += ring->stats.tx_bytes; + stats->tx_packets += ring->stats.tx_pkts; + stats->tx_dropped += ring->stats.sw_err_cnt; + stats->tx_dropped += ring->stats.tx_vlan_err; + stats->tx_dropped += ring->stats.tx_l4_proto_err; + stats->tx_dropped += ring->stats.tx_l2l3l4_err; + stats->tx_dropped += ring->stats.tx_tso_err; + stats->tx_dropped += ring->stats.over_max_recursion; + stats->tx_dropped += ring->stats.hw_limitation; + stats->tx_dropped += ring->stats.copy_bits_err; + stats->tx_dropped += ring->stats.skb2sgl_err; + stats->tx_dropped += ring->stats.map_sg_err; + stats->tx_errors += ring->stats.sw_err_cnt; + stats->tx_errors += ring->stats.tx_vlan_err; + stats->tx_errors += ring->stats.tx_l4_proto_err; + stats->tx_errors += ring->stats.tx_l2l3l4_err; + stats->tx_errors += ring->stats.tx_tso_err; + stats->tx_errors += ring->stats.over_max_recursion; + stats->tx_errors += ring->stats.hw_limitation; + stats->tx_errors += ring->stats.copy_bits_err; + stats->tx_errors += ring->stats.skb2sgl_err; + stats->tx_errors += ring->stats.map_sg_err; + } else { + stats->rx_bytes += ring->stats.rx_bytes; + stats->rx_packets += ring->stats.rx_pkts; + stats->rx_dropped += ring->stats.l2_err; + stats->rx_errors += ring->stats.l2_err; + stats->rx_errors += ring->stats.l3l4_csum_err; + stats->rx_crc_errors += ring->stats.l2_err; + stats->multicast += ring->stats.rx_multicast; + stats->rx_length_errors += ring->stats.err_pkt_len; + } + } while (u64_stats_fetch_retry_irq(&ring->syncp, start)); +} + static void hns3_nic_get_stats64(struct net_device *netdev, struct rtnl_link_stats64 *stats) { struct hns3_nic_priv *priv = netdev_priv(netdev); int queue_num = priv->ae_handle->kinfo.num_tqps; struct hnae3_handle *handle = priv->ae_handle; + struct rtnl_link_stats64 ring_total_stats; struct hns3_enet_ring *ring; - u64 rx_length_errors = 0; - u64 rx_crc_errors = 0; - u64 rx_multicast = 0; - unsigned int start; - u64 tx_errors = 0; - u64 rx_errors = 0; unsigned int idx; - u64 tx_bytes = 0; - u64 rx_bytes = 0; - u64 tx_pkts = 0; - u64 rx_pkts = 0; - u64 tx_drop = 0; - u64 rx_drop = 0;
if (test_bit(HNS3_NIC_STATE_DOWN, &priv->state)) return;
handle->ae_algo->ops->update_stats(handle, &netdev->stats);
+ memset(&ring_total_stats, 0, sizeof(ring_total_stats)); for (idx = 0; idx < queue_num; idx++) { /* fetch the tx stats */ ring = &priv->ring[idx]; - do { - start = u64_stats_fetch_begin_irq(&ring->syncp); - tx_bytes += ring->stats.tx_bytes; - tx_pkts += ring->stats.tx_pkts; - tx_drop += ring->stats.sw_err_cnt; - tx_drop += ring->stats.tx_vlan_err; - tx_drop += ring->stats.tx_l4_proto_err; - tx_drop += ring->stats.tx_l2l3l4_err; - tx_drop += ring->stats.tx_tso_err; - tx_drop += ring->stats.over_max_recursion; - tx_drop += ring->stats.hw_limitation; - tx_drop += ring->stats.copy_bits_err; - tx_drop += ring->stats.skb2sgl_err; - tx_drop += ring->stats.map_sg_err; - tx_errors += ring->stats.sw_err_cnt; - tx_errors += ring->stats.tx_vlan_err; - tx_errors += ring->stats.tx_l4_proto_err; - tx_errors += ring->stats.tx_l2l3l4_err; - tx_errors += ring->stats.tx_tso_err; - tx_errors += ring->stats.over_max_recursion; - tx_errors += ring->stats.hw_limitation; - tx_errors += ring->stats.copy_bits_err; - tx_errors += ring->stats.skb2sgl_err; - tx_errors += ring->stats.map_sg_err; - } while (u64_stats_fetch_retry_irq(&ring->syncp, start)); + hns3_fetch_stats(&ring_total_stats, ring, true);
/* fetch the rx stats */ ring = &priv->ring[idx + queue_num]; - do { - start = u64_stats_fetch_begin_irq(&ring->syncp); - rx_bytes += ring->stats.rx_bytes; - rx_pkts += ring->stats.rx_pkts; - rx_drop += ring->stats.l2_err; - rx_errors += ring->stats.l2_err; - rx_errors += ring->stats.l3l4_csum_err; - rx_crc_errors += ring->stats.l2_err; - rx_multicast += ring->stats.rx_multicast; - rx_length_errors += ring->stats.err_pkt_len; - } while (u64_stats_fetch_retry_irq(&ring->syncp, start)); - } - - stats->tx_bytes = tx_bytes; - stats->tx_packets = tx_pkts; - stats->rx_bytes = rx_bytes; - stats->rx_packets = rx_pkts; - - stats->rx_errors = rx_errors; - stats->multicast = rx_multicast; - stats->rx_length_errors = rx_length_errors; - stats->rx_crc_errors = rx_crc_errors; + hns3_fetch_stats(&ring_total_stats, ring, false); + } + + stats->tx_bytes = ring_total_stats.tx_bytes; + stats->tx_packets = ring_total_stats.tx_packets; + stats->rx_bytes = ring_total_stats.rx_bytes; + stats->rx_packets = ring_total_stats.rx_packets; + + stats->rx_errors = ring_total_stats.rx_errors; + stats->multicast = ring_total_stats.multicast; + stats->rx_length_errors = ring_total_stats.rx_length_errors; + stats->rx_crc_errors = ring_total_stats.rx_crc_errors; stats->rx_missed_errors = netdev->stats.rx_missed_errors;
- stats->tx_errors = tx_errors; - stats->rx_dropped = rx_drop; - stats->tx_dropped = tx_drop; + stats->tx_errors = ring_total_stats.tx_errors; + stats->rx_dropped = ring_total_stats.rx_dropped; + stats->tx_dropped = ring_total_stats.tx_dropped; stats->collisions = netdev->stats.collisions; stats->rx_over_errors = netdev->stats.rx_over_errors; stats->rx_frame_errors = netdev->stats.rx_frame_errors;
From: Yufeng Mo moyufeng@huawei.com
mainline inclusion from mainline-master commit 2fbf6a07f537 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4M1HB CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
----------------------------------------------------------------------
Function hns3_handle_bdinfo() is a bit too long. So add two new functions hns3_handle_rx_ts_info() and hns3_handle_rx_vlan_tag( to simplify code and improve code readability.
Signed-off-by: Yufeng Mo moyufeng@huawei.com Signed-off-by: Guangbin Huang huangguangbin2@huawei.com Signed-off-by: David S. Miller davem@davemloft.net Reviewed-by: Yongxin Li liyongxin1@huawei.com Signed-off-by: Junxin Chen chenjunxin1@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- .../net/ethernet/hisilicon/hns3/hns3_enet.c | 54 ++++++++++++------- 1 file changed, 35 insertions(+), 19 deletions(-)
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c index b64a4c29e78c..ff153ce3a2a9 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c @@ -4042,6 +4042,39 @@ static void hns3_set_rx_skb_rss_type(struct hns3_enet_ring *ring, skb_set_hash(skb, rss_hash, rss_type); }
+static void hns3_handle_rx_ts_info(struct net_device *netdev, + struct hns3_desc *desc, struct sk_buff *skb, + u32 bd_base_info) +{ + if (unlikely(bd_base_info & BIT(HNS3_RXD_TS_VLD_B))) { + struct hnae3_handle *h = hns3_get_handle(netdev); + u32 nsec = le32_to_cpu(desc->ts_nsec); + u32 sec = le32_to_cpu(desc->ts_sec); + + if (h->ae_algo->ops->get_rx_hwts) + h->ae_algo->ops->get_rx_hwts(h, skb, nsec, sec); + } +} + +static void hns3_handle_rx_vlan_tag(struct hns3_enet_ring *ring, + struct hns3_desc *desc, struct sk_buff *skb, + u32 l234info) +{ + struct net_device *netdev = ring_to_netdev(ring); + + /* Based on hw strategy, the tag offloaded will be stored at + * ot_vlan_tag in two layer tag case, and stored at vlan_tag + * in one layer tag case. + */ + if (netdev->features & NETIF_F_HW_VLAN_CTAG_RX) { + u16 vlan_tag; + + if (hns3_parse_vlan_tag(ring, desc, l234info, &vlan_tag)) + __vlan_hwaccel_put_tag(skb, htons(ETH_P_8021Q), + vlan_tag); + } +} + static int hns3_handle_bdinfo(struct hns3_enet_ring *ring, struct sk_buff *skb) { struct net_device *netdev = ring_to_netdev(ring); @@ -4064,26 +4097,9 @@ static int hns3_handle_bdinfo(struct hns3_enet_ring *ring, struct sk_buff *skb) ol_info = le32_to_cpu(desc->rx.ol_info); csum = le16_to_cpu(desc->csum);
- if (unlikely(bd_base_info & BIT(HNS3_RXD_TS_VLD_B))) { - struct hnae3_handle *h = hns3_get_handle(netdev); - u32 nsec = le32_to_cpu(desc->ts_nsec); - u32 sec = le32_to_cpu(desc->ts_sec); - - if (h->ae_algo->ops->get_rx_hwts) - h->ae_algo->ops->get_rx_hwts(h, skb, nsec, sec); - } + hns3_handle_rx_ts_info(netdev, desc, skb, bd_base_info);
- /* Based on hw strategy, the tag offloaded will be stored at - * ot_vlan_tag in two layer tag case, and stored at vlan_tag - * in one layer tag case. - */ - if (netdev->features & NETIF_F_HW_VLAN_CTAG_RX) { - u16 vlan_tag; - - if (hns3_parse_vlan_tag(ring, desc, l234info, &vlan_tag)) - __vlan_hwaccel_put_tag(skb, htons(ETH_P_8021Q), - vlan_tag); - } + hns3_handle_rx_vlan_tag(ring, desc, skb, l234info);
if (unlikely(!desc->rx.pkt_len || (l234info & (BIT(HNS3_RXD_TRUNCAT_B) | BIT(HNS3_RXD_L2E_B))))) {
From: Yufeng Mo moyufeng@huawei.com
mainline inclusion from mainline-master commit 1d851c0905f8 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4M1HB CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
----------------------------------------------------------------------
Function hns3_set_l2l3l4() is a bit too long. So add two new functions hns3_set_l3_type() and hns3_set_l4_csum_length() to simplify code and improve code readability.
Signed-off-by: Yufeng Mo moyufeng@huawei.com Signed-off-by: Guangbin Huang huangguangbin2@huawei.com Signed-off-by: David S. Miller davem@davemloft.net Reviewed-by: Yongxin Li liyongxin1@huawei.com Signed-off-by: Junxin Chen chenjunxin1@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- .../net/ethernet/hisilicon/hns3/hns3_enet.c | 102 ++++++++++-------- 1 file changed, 57 insertions(+), 45 deletions(-)
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c index ff153ce3a2a9..1660e59115bf 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c @@ -1354,44 +1354,9 @@ static void hns3_set_outer_l2l3l4(struct sk_buff *skb, u8 ol4_proto, HNS3_TUN_NVGRE); }
-static int hns3_set_l2l3l4(struct sk_buff *skb, u8 ol4_proto, - u8 il4_proto, u32 *type_cs_vlan_tso, - u32 *ol_type_vlan_len_msec) +static void hns3_set_l3_type(struct sk_buff *skb, union l3_hdr_info l3, + u32 *type_cs_vlan_tso) { - unsigned char *l2_hdr = skb->data; - u32 l4_proto = ol4_proto; - union l4_hdr_info l4; - union l3_hdr_info l3; - u32 l2_len, l3_len; - - l4.hdr = skb_transport_header(skb); - l3.hdr = skb_network_header(skb); - - /* handle encapsulation skb */ - if (skb->encapsulation) { - /* If this is a not UDP/GRE encapsulation skb */ - if (!(ol4_proto == IPPROTO_UDP || ol4_proto == IPPROTO_GRE)) { - /* drop the skb tunnel packet if hardware don't support, - * because hardware can't calculate csum when TSO. - */ - if (skb_is_gso(skb)) - return -EDOM; - - /* the stack computes the IP header already, - * driver calculate l4 checksum when not TSO. - */ - return skb_checksum_help(skb); - } - - hns3_set_outer_l2l3l4(skb, ol4_proto, ol_type_vlan_len_msec); - - /* switch to inner header */ - l2_hdr = skb_inner_mac_header(skb); - l3.hdr = skb_inner_network_header(skb); - l4.hdr = skb_inner_transport_header(skb); - l4_proto = il4_proto; - } - if (l3.v4->version == 4) { hns3_set_field(*type_cs_vlan_tso, HNS3_TXD_L3T_S, HNS3_L3T_IPV4); @@ -1405,15 +1370,11 @@ static int hns3_set_l2l3l4(struct sk_buff *skb, u8 ol4_proto, hns3_set_field(*type_cs_vlan_tso, HNS3_TXD_L3T_S, HNS3_L3T_IPV6); } +}
- /* compute inner(/normal) L2 header size, defined in 2 Bytes */ - l2_len = l3.hdr - l2_hdr; - hns3_set_field(*type_cs_vlan_tso, HNS3_TXD_L2LEN_S, l2_len >> 1); - - /* compute inner(/normal) L3 header size, defined in 4 Bytes */ - l3_len = l4.hdr - l3.hdr; - hns3_set_field(*type_cs_vlan_tso, HNS3_TXD_L3LEN_S, l3_len >> 2); - +static int hns3_set_l4_csum_length(struct sk_buff *skb, union l4_hdr_info l4, + u32 l4_proto, u32 *type_cs_vlan_tso) +{ /* compute inner(/normal) L4 header size, defined in 4 Bytes */ switch (l4_proto) { case IPPROTO_TCP: @@ -1459,6 +1420,57 @@ static int hns3_set_l2l3l4(struct sk_buff *skb, u8 ol4_proto, return 0; }
+static int hns3_set_l2l3l4(struct sk_buff *skb, u8 ol4_proto, + u8 il4_proto, u32 *type_cs_vlan_tso, + u32 *ol_type_vlan_len_msec) +{ + unsigned char *l2_hdr = skb->data; + u32 l4_proto = ol4_proto; + union l4_hdr_info l4; + union l3_hdr_info l3; + u32 l2_len, l3_len; + + l4.hdr = skb_transport_header(skb); + l3.hdr = skb_network_header(skb); + + /* handle encapsulation skb */ + if (skb->encapsulation) { + /* If this is a not UDP/GRE encapsulation skb */ + if (!(ol4_proto == IPPROTO_UDP || ol4_proto == IPPROTO_GRE)) { + /* drop the skb tunnel packet if hardware don't support, + * because hardware can't calculate csum when TSO. + */ + if (skb_is_gso(skb)) + return -EDOM; + + /* the stack computes the IP header already, + * driver calculate l4 checksum when not TSO. + */ + return skb_checksum_help(skb); + } + + hns3_set_outer_l2l3l4(skb, ol4_proto, ol_type_vlan_len_msec); + + /* switch to inner header */ + l2_hdr = skb_inner_mac_header(skb); + l3.hdr = skb_inner_network_header(skb); + l4.hdr = skb_inner_transport_header(skb); + l4_proto = il4_proto; + } + + hns3_set_l3_type(skb, l3, type_cs_vlan_tso); + + /* compute inner(/normal) L2 header size, defined in 2 Bytes */ + l2_len = l3.hdr - l2_hdr; + hns3_set_field(*type_cs_vlan_tso, HNS3_TXD_L2LEN_S, l2_len >> 1); + + /* compute inner(/normal) L3 header size, defined in 4 Bytes */ + l3_len = l4.hdr - l3.hdr; + hns3_set_field(*type_cs_vlan_tso, HNS3_TXD_L3LEN_S, l3_len >> 2); + + return hns3_set_l4_csum_length(skb, l4, l4_proto, type_cs_vlan_tso); +} + static int hns3_handle_vtags(struct hns3_enet_ring *tx_ring, struct sk_buff *skb) {
From: Wei Yongjun weiyongjun1@huawei.com
mainline inclusion from mainline-master commit c0190879323f category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4M1HB CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
----------------------------------------------------------------------
The sparse tool complains as follows:
drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c:2656:28: warning: symbol 'hclge_mac_speed_map_to_fw' was not declared. Should it be static?
This symbol is not used outside of hclge_main.c, so marks it static.
Fixes: e46da6a3d4d3 ("net: hns3: refine function hclge_cfg_mac_speed_dup_hw()") Reported-by: Hulk Robot hulkci@huawei.com Signed-off-by: Wei Yongjun weiyongjun1@huawei.com Signed-off-by: David S. Miller davem@davemloft.net Reviewed-by: Yongxin Li liyongxin1@huawei.com Signed-off-by: Junxin Chen chenjunxin1@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c index 753e6c2fff80..5dcb06407c2b 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c @@ -2653,7 +2653,7 @@ static u8 hclge_check_speed_dup(u8 duplex, int speed) return duplex; }
-struct hclge_mac_speed_map hclge_mac_speed_map_to_fw[] = { +static struct hclge_mac_speed_map hclge_mac_speed_map_to_fw[] = { {HCLGE_MAC_SPEED_10M, HCLGE_FW_MAC_SPEED_10M}, {HCLGE_MAC_SPEED_100M, HCLGE_FW_MAC_SPEED_100M}, {HCLGE_MAC_SPEED_1G, HCLGE_FW_MAC_SPEED_1G},
From: Peng Li lipeng321@huawei.com
mainline inclusion from mainline-master commit e6d72f6ac2ad category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4M1HB CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
----------------------------------------------------------------------
As the code to update ring stats is alike for different ring stats type, this patch extract macro to simplify ring stats update code.
Signed-off-by: Peng Li lipeng321@huawei.com Signed-off-by: Guangbin Huang huangguangbin2@huawei.com Signed-off-by: David S. Miller davem@davemloft.net Reviewed-by: Yongxin Li liyongxin1@huawei.com Signed-off-by: Junxin Chen chenjunxin1@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- .../net/ethernet/hisilicon/hns3/hns3_enet.c | 123 +++++------------- .../net/ethernet/hisilicon/hns3/hns3_enet.h | 7 + 2 files changed, 38 insertions(+), 92 deletions(-)
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c index 1660e59115bf..2bf3aa700abc 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c @@ -1001,9 +1001,7 @@ static bool hns3_can_use_tx_bounce(struct hns3_enet_ring *ring, return false;
if (ALIGN(len, dma_get_cache_alignment()) > space) { - u64_stats_update_begin(&ring->syncp); - ring->stats.tx_spare_full++; - u64_stats_update_end(&ring->syncp); + hns3_ring_stats_update(ring, tx_spare_full); return false; }
@@ -1020,9 +1018,7 @@ static bool hns3_can_use_tx_sgl(struct hns3_enet_ring *ring, return false;
if (space < HNS3_MAX_SGL_SIZE) { - u64_stats_update_begin(&ring->syncp); - ring->stats.tx_spare_full++; - u64_stats_update_end(&ring->syncp); + hns3_ring_stats_update(ring, tx_spare_full); return false; }
@@ -1561,9 +1557,7 @@ static int hns3_fill_skb_desc(struct hns3_enet_ring *ring,
ret = hns3_handle_vtags(ring, skb); if (unlikely(ret < 0)) { - u64_stats_update_begin(&ring->syncp); - ring->stats.tx_vlan_err++; - u64_stats_update_end(&ring->syncp); + hns3_ring_stats_update(ring, tx_vlan_err); return ret; } else if (ret == HNS3_INNER_VLAN_TAG) { inner_vtag = skb_vlan_tag_get(skb); @@ -1598,9 +1592,7 @@ static int hns3_fill_skb_desc(struct hns3_enet_ring *ring,
ret = hns3_get_l4_protocol(skb, &ol4_proto, &il4_proto); if (unlikely(ret < 0)) { - u64_stats_update_begin(&ring->syncp); - ring->stats.tx_l4_proto_err++; - u64_stats_update_end(&ring->syncp); + hns3_ring_stats_update(ring, tx_l4_proto_err); return ret; }
@@ -1608,18 +1600,14 @@ static int hns3_fill_skb_desc(struct hns3_enet_ring *ring, &type_cs_vlan_tso, &ol_type_vlan_len_msec); if (unlikely(ret < 0)) { - u64_stats_update_begin(&ring->syncp); - ring->stats.tx_l2l3l4_err++; - u64_stats_update_end(&ring->syncp); + hns3_ring_stats_update(ring, tx_l2l3l4_err); return ret; }
ret = hns3_set_tso(skb, &paylen_ol4cs, &mss_hw_csum, &type_cs_vlan_tso, &desc_cb->send_bytes); if (unlikely(ret < 0)) { - u64_stats_update_begin(&ring->syncp); - ring->stats.tx_tso_err++; - u64_stats_update_end(&ring->syncp); + hns3_ring_stats_update(ring, tx_tso_err); return ret; } } @@ -1712,9 +1700,7 @@ static int hns3_map_and_fill_desc(struct hns3_enet_ring *ring, void *priv, }
if (unlikely(dma_mapping_error(dev, dma))) { - u64_stats_update_begin(&ring->syncp); - ring->stats.sw_err_cnt++; - u64_stats_update_end(&ring->syncp); + hns3_ring_stats_update(ring, sw_err_cnt); return -ENOMEM; }
@@ -1860,9 +1846,7 @@ static int hns3_skb_linearize(struct hns3_enet_ring *ring, * recursion level of over HNS3_MAX_RECURSION_LEVEL. */ if (bd_num == UINT_MAX) { - u64_stats_update_begin(&ring->syncp); - ring->stats.over_max_recursion++; - u64_stats_update_end(&ring->syncp); + hns3_ring_stats_update(ring, over_max_recursion); return -ENOMEM; }
@@ -1871,16 +1855,12 @@ static int hns3_skb_linearize(struct hns3_enet_ring *ring, */ if (skb->len > HNS3_MAX_TSO_SIZE || (!skb_is_gso(skb) && skb->len > HNS3_MAX_NON_TSO_SIZE)) { - u64_stats_update_begin(&ring->syncp); - ring->stats.hw_limitation++; - u64_stats_update_end(&ring->syncp); + hns3_ring_stats_update(ring, hw_limitation); return -ENOMEM; }
if (__skb_linearize(skb)) { - u64_stats_update_begin(&ring->syncp); - ring->stats.sw_err_cnt++; - u64_stats_update_end(&ring->syncp); + hns3_ring_stats_update(ring, sw_err_cnt); return -ENOMEM; }
@@ -1910,9 +1890,7 @@ static int hns3_nic_maybe_stop_tx(struct hns3_enet_ring *ring,
bd_num = hns3_tx_bd_count(skb->len);
- u64_stats_update_begin(&ring->syncp); - ring->stats.tx_copy++; - u64_stats_update_end(&ring->syncp); + hns3_ring_stats_update(ring, tx_copy); }
out: @@ -1932,9 +1910,7 @@ static int hns3_nic_maybe_stop_tx(struct hns3_enet_ring *ring, return bd_num; }
- u64_stats_update_begin(&ring->syncp); - ring->stats.tx_busy++; - u64_stats_update_end(&ring->syncp); + hns3_ring_stats_update(ring, tx_busy);
return -EBUSY; } @@ -2019,9 +1995,7 @@ static void hns3_tx_doorbell(struct hns3_enet_ring *ring, int num, ring->pending_buf += num;
if (!doorbell) { - u64_stats_update_begin(&ring->syncp); - ring->stats.tx_more++; - u64_stats_update_end(&ring->syncp); + hns3_ring_stats_update(ring, tx_more); return; }
@@ -2071,9 +2045,7 @@ static int hns3_handle_tx_bounce(struct hns3_enet_ring *ring, ret = skb_copy_bits(skb, 0, buf, size); if (unlikely(ret < 0)) { hns3_tx_spare_rollback(ring, cb_len); - u64_stats_update_begin(&ring->syncp); - ring->stats.copy_bits_err++; - u64_stats_update_end(&ring->syncp); + hns3_ring_stats_update(ring, copy_bits_err); return ret; }
@@ -2096,9 +2068,8 @@ static int hns3_handle_tx_bounce(struct hns3_enet_ring *ring, dma_sync_single_for_device(ring_to_dev(ring), dma, size, DMA_TO_DEVICE);
- u64_stats_update_begin(&ring->syncp); - ring->stats.tx_bounce++; - u64_stats_update_end(&ring->syncp); + hns3_ring_stats_update(ring, tx_bounce); + return bd_num; }
@@ -2128,9 +2099,7 @@ static int hns3_handle_tx_sgl(struct hns3_enet_ring *ring, nents = skb_to_sgvec(skb, sgt->sgl, 0, skb->len); if (unlikely(nents < 0)) { hns3_tx_spare_rollback(ring, cb_len); - u64_stats_update_begin(&ring->syncp); - ring->stats.skb2sgl_err++; - u64_stats_update_end(&ring->syncp); + hns3_ring_stats_update(ring, skb2sgl_err); return -ENOMEM; }
@@ -2139,9 +2108,7 @@ static int hns3_handle_tx_sgl(struct hns3_enet_ring *ring, DMA_TO_DEVICE); if (unlikely(!sgt->nents)) { hns3_tx_spare_rollback(ring, cb_len); - u64_stats_update_begin(&ring->syncp); - ring->stats.map_sg_err++; - u64_stats_update_end(&ring->syncp); + hns3_ring_stats_update(ring, map_sg_err); return -ENOMEM; }
@@ -2153,10 +2120,7 @@ static int hns3_handle_tx_sgl(struct hns3_enet_ring *ring, for (i = 0; i < sgt->nents; i++) bd_num += hns3_fill_desc(ring, sg_dma_address(sgt->sgl + i), sg_dma_len(sgt->sgl + i)); - - u64_stats_update_begin(&ring->syncp); - ring->stats.tx_sgl++; - u64_stats_update_end(&ring->syncp); + hns3_ring_stats_update(ring, tx_sgl);
return bd_num; } @@ -2195,9 +2159,7 @@ netdev_tx_t hns3_nic_net_xmit(struct sk_buff *skb, struct net_device *netdev) if (skb_put_padto(skb, HNS3_MIN_TX_LEN)) { hns3_tx_doorbell(ring, 0, !netdev_xmit_more());
- u64_stats_update_begin(&ring->syncp); - ring->stats.sw_err_cnt++; - u64_stats_update_end(&ring->syncp); + hns3_ring_stats_update(ring, sw_err_cnt);
return NETDEV_TX_OK; } @@ -3521,17 +3483,13 @@ static bool hns3_nic_alloc_rx_buffers(struct hns3_enet_ring *ring, for (i = 0; i < cleand_count; i++) { desc_cb = &ring->desc_cb[ring->next_to_use]; if (desc_cb->reuse_flag) { - u64_stats_update_begin(&ring->syncp); - ring->stats.reuse_pg_cnt++; - u64_stats_update_end(&ring->syncp); + hns3_ring_stats_update(ring, reuse_pg_cnt);
hns3_reuse_buffer(ring, ring->next_to_use); } else { ret = hns3_alloc_and_map_buffer(ring, &res_cbs); if (ret) { - u64_stats_update_begin(&ring->syncp); - ring->stats.sw_err_cnt++; - u64_stats_update_end(&ring->syncp); + hns3_ring_stats_update(ring, sw_err_cnt);
hns3_rl_err(ring_to_netdev(ring), "alloc rx buffer failed: %d\n", @@ -3543,9 +3501,7 @@ static bool hns3_nic_alloc_rx_buffers(struct hns3_enet_ring *ring, } hns3_replace_buffer(ring, ring->next_to_use, &res_cbs);
- u64_stats_update_begin(&ring->syncp); - ring->stats.non_reuse_pg++; - u64_stats_update_end(&ring->syncp); + hns3_ring_stats_update(ring, non_reuse_pg); }
ring_ptr_move_fw(ring, next_to_use); @@ -3572,9 +3528,7 @@ static int hns3_handle_rx_copybreak(struct sk_buff *skb, int i, void *frag = napi_alloc_frag(frag_size);
if (unlikely(!frag)) { - u64_stats_update_begin(&ring->syncp); - ring->stats.frag_alloc_err++; - u64_stats_update_end(&ring->syncp); + hns3_ring_stats_update(ring, frag_alloc_err);
hns3_rl_err(ring_to_netdev(ring), "failed to allocate rx frag\n"); @@ -3586,9 +3540,7 @@ static int hns3_handle_rx_copybreak(struct sk_buff *skb, int i, skb_add_rx_frag(skb, i, virt_to_page(frag), offset_in_page(frag), frag_size, frag_size);
- u64_stats_update_begin(&ring->syncp); - ring->stats.frag_alloc++; - u64_stats_update_end(&ring->syncp); + hns3_ring_stats_update(ring, frag_alloc); return 0; }
@@ -3721,9 +3673,7 @@ static bool hns3_checksum_complete(struct hns3_enet_ring *ring, hns3_rx_ptype_tbl[ptype].ip_summed != CHECKSUM_COMPLETE) return false;
- u64_stats_update_begin(&ring->syncp); - ring->stats.csum_complete++; - u64_stats_update_end(&ring->syncp); + hns3_ring_stats_update(ring, csum_complete); skb->ip_summed = CHECKSUM_COMPLETE; skb->csum = csum_unfold((__force __sum16)csum);
@@ -3797,9 +3747,7 @@ static void hns3_rx_checksum(struct hns3_enet_ring *ring, struct sk_buff *skb, if (unlikely(l234info & (BIT(HNS3_RXD_L3E_B) | BIT(HNS3_RXD_L4E_B) | BIT(HNS3_RXD_OL3E_B) | BIT(HNS3_RXD_OL4E_B)))) { - u64_stats_update_begin(&ring->syncp); - ring->stats.l3l4_csum_err++; - u64_stats_update_end(&ring->syncp); + hns3_ring_stats_update(ring, l3l4_csum_err);
return; } @@ -3890,10 +3838,7 @@ static int hns3_alloc_skb(struct hns3_enet_ring *ring, unsigned int length, skb = ring->skb; if (unlikely(!skb)) { hns3_rl_err(netdev, "alloc rx skb fail\n"); - - u64_stats_update_begin(&ring->syncp); - ring->stats.sw_err_cnt++; - u64_stats_update_end(&ring->syncp); + hns3_ring_stats_update(ring, sw_err_cnt);
return -ENOMEM; } @@ -3924,9 +3869,7 @@ static int hns3_alloc_skb(struct hns3_enet_ring *ring, unsigned int length, if (ring->page_pool) skb_mark_for_recycle(skb);
- u64_stats_update_begin(&ring->syncp); - ring->stats.seg_pkt_cnt++; - u64_stats_update_end(&ring->syncp); + hns3_ring_stats_update(ring, seg_pkt_cnt);
ring->pull_len = eth_get_headlen(netdev, va, HNS3_RX_HEAD_SIZE); __skb_put(skb, ring->pull_len); @@ -4134,9 +4077,7 @@ static int hns3_handle_bdinfo(struct hns3_enet_ring *ring, struct sk_buff *skb) ret = hns3_set_gro_and_checksum(ring, skb, l234info, bd_base_info, ol_info, csum); if (unlikely(ret)) { - u64_stats_update_begin(&ring->syncp); - ring->stats.rx_err_cnt++; - u64_stats_update_end(&ring->syncp); + hns3_ring_stats_update(ring, rx_err_cnt); return ret; }
@@ -5346,9 +5287,7 @@ static int hns3_clear_rx_ring(struct hns3_enet_ring *ring) if (!ring->desc_cb[ring->next_to_use].reuse_flag) { ret = hns3_alloc_and_map_buffer(ring, &res_cbs); if (ret) { - u64_stats_update_begin(&ring->syncp); - ring->stats.sw_err_cnt++; - u64_stats_update_end(&ring->syncp); + hns3_ring_stats_update(ring, sw_err_cnt); /* if alloc new buffer fail, exit directly * and reclear in up flow. */ diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.h b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.h index 808405cc0280..2803b2cd7f30 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.h +++ b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.h @@ -660,6 +660,13 @@ static inline bool hns3_nic_resetting(struct net_device *netdev)
#define hns3_buf_size(_ring) ((_ring)->buf_size)
+#define hns3_ring_stats_update(ring, cnt) do { \ + typeof(ring) (tmp) = (ring); \ + u64_stats_update_begin(&(tmp)->syncp); \ + ((tmp)->stats.cnt)++; \ + u64_stats_update_end(&(tmp)->syncp); \ +} while (0) \ + static inline unsigned int hns3_page_order(struct hns3_enet_ring *ring) { #if (PAGE_SIZE < 8192)
From: Peng Li lipeng321@huawei.com
mainline inclusion from mainline-master commit a1cfb24d011a category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4M1HB CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
----------------------------------------------------------------------
The function hns3_fill_skb_desc is hard to read, this patch extract 2 functions and add new a struct data to simplify the code and Improve readability.
Signed-off-by: Peng Li lipeng321@huawei.com Signed-off-by: Guangbin Huang huangguangbin2@huawei.com Signed-off-by: David S. Miller davem@davemloft.net Reviewed-by: Yongxin Li liyongxin1@huawei.com Signed-off-by: Junxin Chen chenjunxin1@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- .../net/ethernet/hisilicon/hns3/hns3_enet.c | 147 +++++++++++------- 1 file changed, 93 insertions(+), 54 deletions(-)
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c index 2bf3aa700abc..014bcfe8898d 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c @@ -1543,16 +1543,29 @@ static bool hns3_check_hw_tx_csum(struct sk_buff *skb) return true; }
-static int hns3_fill_skb_desc(struct hns3_enet_ring *ring, - struct sk_buff *skb, struct hns3_desc *desc, - struct hns3_desc_cb *desc_cb) +struct hns3_desc_param { + u32 paylen_ol4cs; + u32 ol_type_vlan_len_msec; + u32 type_cs_vlan_tso; + u16 mss_hw_csum; + u16 inner_vtag; + u16 out_vtag; +}; + +static void hns3_init_desc_data(struct sk_buff *skb, struct hns3_desc_param *pa) +{ + pa->paylen_ol4cs = skb->len; + pa->ol_type_vlan_len_msec = 0; + pa->type_cs_vlan_tso = 0; + pa->mss_hw_csum = 0; + pa->inner_vtag = 0; + pa->out_vtag = 0; +} + +static int hns3_handle_vlan_info(struct hns3_enet_ring *ring, + struct sk_buff *skb, + struct hns3_desc_param *param) { - u32 ol_type_vlan_len_msec = 0; - u32 paylen_ol4cs = skb->len; - u32 type_cs_vlan_tso = 0; - u16 mss_hw_csum = 0; - u16 inner_vtag = 0; - u16 out_vtag = 0; int ret;
ret = hns3_handle_vtags(ring, skb); @@ -1560,67 +1573,93 @@ static int hns3_fill_skb_desc(struct hns3_enet_ring *ring, hns3_ring_stats_update(ring, tx_vlan_err); return ret; } else if (ret == HNS3_INNER_VLAN_TAG) { - inner_vtag = skb_vlan_tag_get(skb); - inner_vtag |= (skb->priority << VLAN_PRIO_SHIFT) & + param->inner_vtag = skb_vlan_tag_get(skb); + param->inner_vtag |= (skb->priority << VLAN_PRIO_SHIFT) & VLAN_PRIO_MASK; - hns3_set_field(type_cs_vlan_tso, HNS3_TXD_VLAN_B, 1); + hns3_set_field(param->type_cs_vlan_tso, HNS3_TXD_VLAN_B, 1); } else if (ret == HNS3_OUTER_VLAN_TAG) { - out_vtag = skb_vlan_tag_get(skb); - out_vtag |= (skb->priority << VLAN_PRIO_SHIFT) & + param->out_vtag = skb_vlan_tag_get(skb); + param->out_vtag |= (skb->priority << VLAN_PRIO_SHIFT) & VLAN_PRIO_MASK; - hns3_set_field(ol_type_vlan_len_msec, HNS3_TXD_OVLAN_B, + hns3_set_field(param->ol_type_vlan_len_msec, HNS3_TXD_OVLAN_B, 1); } + return 0; +}
- desc_cb->send_bytes = skb->len; +static int hns3_handle_csum_partial(struct hns3_enet_ring *ring, + struct sk_buff *skb, + struct hns3_desc_cb *desc_cb, + struct hns3_desc_param *param) +{ + u8 ol4_proto, il4_proto; + int ret;
- if (skb->ip_summed == CHECKSUM_PARTIAL) { - u8 ol4_proto, il4_proto; - - if (hns3_check_hw_tx_csum(skb)) { - /* set checksum start and offset, defined in 2 Bytes */ - hns3_set_field(type_cs_vlan_tso, HNS3_TXD_CSUM_START_S, - skb_checksum_start_offset(skb) >> 1); - hns3_set_field(ol_type_vlan_len_msec, - HNS3_TXD_CSUM_OFFSET_S, - skb->csum_offset >> 1); - mss_hw_csum |= BIT(HNS3_TXD_HW_CS_B); - goto out_hw_tx_csum; - } + if (hns3_check_hw_tx_csum(skb)) { + /* set checksum start and offset, defined in 2 Bytes */ + hns3_set_field(param->type_cs_vlan_tso, HNS3_TXD_CSUM_START_S, + skb_checksum_start_offset(skb) >> 1); + hns3_set_field(param->ol_type_vlan_len_msec, + HNS3_TXD_CSUM_OFFSET_S, + skb->csum_offset >> 1); + param->mss_hw_csum |= BIT(HNS3_TXD_HW_CS_B); + return 0; + }
- skb_reset_mac_len(skb); + skb_reset_mac_len(skb);
- ret = hns3_get_l4_protocol(skb, &ol4_proto, &il4_proto); - if (unlikely(ret < 0)) { - hns3_ring_stats_update(ring, tx_l4_proto_err); - return ret; - } + ret = hns3_get_l4_protocol(skb, &ol4_proto, &il4_proto); + if (unlikely(ret < 0)) { + hns3_ring_stats_update(ring, tx_l4_proto_err); + return ret; + }
- ret = hns3_set_l2l3l4(skb, ol4_proto, il4_proto, - &type_cs_vlan_tso, - &ol_type_vlan_len_msec); - if (unlikely(ret < 0)) { - hns3_ring_stats_update(ring, tx_l2l3l4_err); - return ret; - } + ret = hns3_set_l2l3l4(skb, ol4_proto, il4_proto, + ¶m->type_cs_vlan_tso, + ¶m->ol_type_vlan_len_msec); + if (unlikely(ret < 0)) { + hns3_ring_stats_update(ring, tx_l2l3l4_err); + return ret; + } + + ret = hns3_set_tso(skb, ¶m->paylen_ol4cs, ¶m->mss_hw_csum, + ¶m->type_cs_vlan_tso, &desc_cb->send_bytes); + if (unlikely(ret < 0)) { + hns3_ring_stats_update(ring, tx_tso_err); + return ret; + } + return 0; +}
- ret = hns3_set_tso(skb, &paylen_ol4cs, &mss_hw_csum, - &type_cs_vlan_tso, &desc_cb->send_bytes); - if (unlikely(ret < 0)) { - hns3_ring_stats_update(ring, tx_tso_err); +static int hns3_fill_skb_desc(struct hns3_enet_ring *ring, + struct sk_buff *skb, struct hns3_desc *desc, + struct hns3_desc_cb *desc_cb) +{ + struct hns3_desc_param param; + u8 fd_op; + int ret; + + hns3_init_desc_data(skb, ¶m); + ret = hns3_handle_vlan_info(ring, skb, ¶m); + if (unlikely(ret < 0)) + return ret; + + desc_cb->send_bytes = skb->len; + + if (skb->ip_summed == CHECKSUM_PARTIAL) { + ret = hns3_handle_csum_partial(ring, skb, desc_cb, ¶m); + if (ret) return ret; - } }
-out_hw_tx_csum: /* Set txbd */ desc->tx.ol_type_vlan_len_msec = - cpu_to_le32(ol_type_vlan_len_msec); - desc->tx.type_cs_vlan_tso_len = cpu_to_le32(type_cs_vlan_tso); - desc->tx.paylen_ol4cs = cpu_to_le32(paylen_ol4cs); - desc->tx.mss_hw_csum = cpu_to_le16(mss_hw_csum); - desc->tx.vlan_tag = cpu_to_le16(inner_vtag); - desc->tx.outer_vlan_tag = cpu_to_le16(out_vtag); + cpu_to_le32(param.ol_type_vlan_len_msec); + desc->tx.type_cs_vlan_tso_len = cpu_to_le32(param.type_cs_vlan_tso); + desc->tx.paylen_ol4cs = cpu_to_le32(param.paylen_ol4cs); + desc->tx.mss_hw_csum = cpu_to_le16(param.mss_hw_csum); + desc->tx.vlan_tag = cpu_to_le16(param.inner_vtag); + desc->tx.outer_vlan_tag = cpu_to_le16(param.out_vtag);
return 0; }
From: Jian Shen shenjian15@huawei.com
mainline inclusion from mainline-master commit b60f9d2ec479 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4M1HB CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
----------------------------------------------------------------------
Currently the function hclge_init_vlan_config() is a bit long. Split it to several small functions, to simplify code and improve code readability.
Signed-off-by: Jian Shen shenjian15@huawei.com Signed-off-by: Guangbin Huang huangguangbin2@huawei.com Signed-off-by: David S. Miller davem@davemloft.net Reviewed-by: Yongxin Li liyongxin1@huawei.com Signed-off-by: Junxin Chen chenjunxin1@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- .../hisilicon/hns3/hns3pf/hclge_main.c | 97 +++++++++++-------- 1 file changed, 55 insertions(+), 42 deletions(-)
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c index 5dcb06407c2b..117974bbfbb7 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c @@ -10194,67 +10194,80 @@ static int hclge_set_vlan_protocol_type(struct hclge_dev *hdev) return status; }
-static int hclge_init_vlan_config(struct hclge_dev *hdev) +static int hclge_init_vlan_filter(struct hclge_dev *hdev) { -#define HCLGE_DEF_VLAN_TYPE 0x8100 - - struct hnae3_handle *handle = &hdev->vport[0].nic; struct hclge_vport *vport; int ret; int i;
- if (hdev->ae_dev->dev_version >= HNAE3_DEVICE_VERSION_V2) { - /* for revision 0x21, vf vlan filter is per function */ - for (i = 0; i < hdev->num_alloc_vport; i++) { - vport = &hdev->vport[i]; - ret = hclge_set_vlan_filter_ctrl(hdev, - HCLGE_FILTER_TYPE_VF, - HCLGE_FILTER_FE_EGRESS, - true, - vport->vport_id); - if (ret) - return ret; - vport->cur_vlan_fltr_en = true; - } + if (hdev->ae_dev->dev_version < HNAE3_DEVICE_VERSION_V2) + return hclge_set_vlan_filter_ctrl(hdev, HCLGE_FILTER_TYPE_VF, + HCLGE_FILTER_FE_EGRESS_V1_B, + true, 0);
- ret = hclge_set_vlan_filter_ctrl(hdev, HCLGE_FILTER_TYPE_PORT, - HCLGE_FILTER_FE_INGRESS, true, - 0); - if (ret) - return ret; - } else { + /* for revision 0x21, vf vlan filter is per function */ + for (i = 0; i < hdev->num_alloc_vport; i++) { + vport = &hdev->vport[i]; ret = hclge_set_vlan_filter_ctrl(hdev, HCLGE_FILTER_TYPE_VF, - HCLGE_FILTER_FE_EGRESS_V1_B, - true, 0); + HCLGE_FILTER_FE_EGRESS, true, + vport->vport_id); if (ret) return ret; + vport->cur_vlan_fltr_en = true; }
- hdev->vlan_type_cfg.rx_in_fst_vlan_type = HCLGE_DEF_VLAN_TYPE; - hdev->vlan_type_cfg.rx_in_sec_vlan_type = HCLGE_DEF_VLAN_TYPE; - hdev->vlan_type_cfg.rx_ot_fst_vlan_type = HCLGE_DEF_VLAN_TYPE; - hdev->vlan_type_cfg.rx_ot_sec_vlan_type = HCLGE_DEF_VLAN_TYPE; - hdev->vlan_type_cfg.tx_ot_vlan_type = HCLGE_DEF_VLAN_TYPE; - hdev->vlan_type_cfg.tx_in_vlan_type = HCLGE_DEF_VLAN_TYPE; + return hclge_set_vlan_filter_ctrl(hdev, HCLGE_FILTER_TYPE_PORT, + HCLGE_FILTER_FE_INGRESS, true, 0); +}
- ret = hclge_set_vlan_protocol_type(hdev); - if (ret) - return ret; +static int hclge_init_vlan_type(struct hclge_dev *hdev) +{ + hdev->vlan_type_cfg.rx_in_fst_vlan_type = ETH_P_8021Q; + hdev->vlan_type_cfg.rx_in_sec_vlan_type = ETH_P_8021Q; + hdev->vlan_type_cfg.rx_ot_fst_vlan_type = ETH_P_8021Q; + hdev->vlan_type_cfg.rx_ot_sec_vlan_type = ETH_P_8021Q; + hdev->vlan_type_cfg.tx_ot_vlan_type = ETH_P_8021Q; + hdev->vlan_type_cfg.tx_in_vlan_type = ETH_P_8021Q;
- for (i = 0; i < hdev->num_alloc_vport; i++) { - u16 vlan_tag; - u8 qos; + return hclge_set_vlan_protocol_type(hdev); +}
+static int hclge_init_vport_vlan_offload(struct hclge_dev *hdev) +{ + struct hclge_port_base_vlan_config *cfg; + struct hclge_vport *vport; + int ret; + int i; + + for (i = 0; i < hdev->num_alloc_vport; i++) { vport = &hdev->vport[i]; - vlan_tag = vport->port_base_vlan_cfg.vlan_info.vlan_tag; - qos = vport->port_base_vlan_cfg.vlan_info.qos; + cfg = &vport->port_base_vlan_cfg;
- ret = hclge_vlan_offload_cfg(vport, - vport->port_base_vlan_cfg.state, - vlan_tag, qos); + ret = hclge_vlan_offload_cfg(vport, cfg->state, + cfg->vlan_info.vlan_tag, + cfg->vlan_info.qos); if (ret) return ret; } + return 0; +} + +static int hclge_init_vlan_config(struct hclge_dev *hdev) +{ + struct hnae3_handle *handle = &hdev->vport[0].nic; + int ret; + + ret = hclge_init_vlan_filter(hdev); + if (ret) + return ret; + + ret = hclge_init_vlan_type(hdev); + if (ret) + return ret; + + ret = hclge_init_vport_vlan_offload(hdev); + if (ret) + return ret;
return hclge_set_vlan_filter(handle, htons(ETH_P_8021Q), 0, false); }
From: Jian Shen shenjian15@huawei.com
mainline inclusion from mainline-master commit a41fb3961d8d category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4M1HB CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
----------------------------------------------------------------------
Currently the function hclge_get_fd_rule_info() is a bit long. Split it to several small functions, to improve readability.
Signed-off-by: Jian Shen shenjian15@huawei.com Signed-off-by: Guangbin Huang huangguangbin2@huawei.com Signed-off-by: David S. Miller davem@davemloft.net Reviewed-by: Yongxin Li liyongxin1@huawei.com Signed-off-by: Junxin Chen chenjunxin1@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- .../hisilicon/hns3/hns3pf/hclge_main.c | 52 ++++++++++++------- 1 file changed, 34 insertions(+), 18 deletions(-)
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c index 117974bbfbb7..0801a7d0d482 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c @@ -7172,6 +7172,37 @@ static void hclge_fd_get_ext_info(struct ethtool_rx_flow_spec *fs, } }
+static struct hclge_fd_rule *hclge_get_fd_rule(struct hclge_dev *hdev, + u16 location) +{ + struct hclge_fd_rule *rule = NULL; + struct hlist_node *node2; + + hlist_for_each_entry_safe(rule, node2, &hdev->fd_rule_list, rule_node) { + if (rule->location == location) + return rule; + else if (rule->location > location) + return NULL; + } + + return NULL; +} + +static void hclge_fd_get_ring_cookie(struct ethtool_rx_flow_spec *fs, + struct hclge_fd_rule *rule) +{ + if (rule->action == HCLGE_FD_ACTION_DROP_PACKET) { + fs->ring_cookie = RX_CLS_FLOW_DISC; + } else { + u64 vf_id; + + fs->ring_cookie = rule->queue_id; + vf_id = rule->vf_id; + vf_id <<= ETHTOOL_RX_FLOW_SPEC_RING_VF_OFF; + fs->ring_cookie |= vf_id; + } +} + static int hclge_get_fd_rule_info(struct hnae3_handle *handle, struct ethtool_rxnfc *cmd) { @@ -7179,7 +7210,6 @@ static int hclge_get_fd_rule_info(struct hnae3_handle *handle, struct hclge_fd_rule *rule = NULL; struct hclge_dev *hdev = vport->back; struct ethtool_rx_flow_spec *fs; - struct hlist_node *node2;
if (!hnae3_dev_fd_supported(hdev)) return -EOPNOTSUPP; @@ -7188,14 +7218,9 @@ static int hclge_get_fd_rule_info(struct hnae3_handle *handle,
spin_lock_bh(&hdev->fd_rule_lock);
- hlist_for_each_entry_safe(rule, node2, &hdev->fd_rule_list, rule_node) { - if (rule->location >= fs->location) - break; - } - - if (!rule || fs->location != rule->location) { + rule = hclge_get_fd_rule(hdev, fs->location); + if (!rule) { spin_unlock_bh(&hdev->fd_rule_lock); - return -ENOENT; }
@@ -7233,16 +7258,7 @@ static int hclge_get_fd_rule_info(struct hnae3_handle *handle,
hclge_fd_get_ext_info(fs, rule);
- if (rule->action == HCLGE_FD_ACTION_DROP_PACKET) { - fs->ring_cookie = RX_CLS_FLOW_DISC; - } else { - u64 vf_id; - - fs->ring_cookie = rule->queue_id; - vf_id = rule->vf_id; - vf_id <<= ETHTOOL_RX_FLOW_SPEC_RING_VF_OFF; - fs->ring_cookie |= vf_id; - } + hclge_fd_get_ring_cookie(fs, rule);
spin_unlock_bh(&hdev->fd_rule_lock);
From: Yufeng Mo moyufeng@huawei.com
mainline inclusion from mainline-master commit 8d4b409bac57 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4M1HB CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
----------------------------------------------------------------------
Function hns3_nic_net_xmit() is a bit too long. So add a new function hns3_handle_skb_desc() to simplify code and improve code readability.
Signed-off-by: Yufeng Mo moyufeng@huawei.com Signed-off-by: Guangbin Huang huangguangbin2@huawei.com Signed-off-by: David S. Miller davem@davemloft.net Reviewed-by: Yongxin Li liyongxin1@huawei.com Signed-off-by: Junxin Chen chenjunxin1@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- .../net/ethernet/hisilicon/hns3/hns3_enet.c | 47 +++++++++++-------- 1 file changed, 28 insertions(+), 19 deletions(-)
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c index 014bcfe8898d..aa513567f066 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c @@ -1636,7 +1636,6 @@ static int hns3_fill_skb_desc(struct hns3_enet_ring *ring, struct hns3_desc_cb *desc_cb) { struct hns3_desc_param param; - u8 fd_op; int ret;
hns3_init_desc_data(skb, ¶m); @@ -2184,15 +2183,39 @@ static int hns3_handle_desc_filling(struct hns3_enet_ring *ring, return hns3_fill_skb_to_desc(ring, skb, DESC_TYPE_SKB); }
+static int hns3_handle_skb_desc(struct hns3_enet_ring *ring, + struct sk_buff *skb, + struct hns3_desc_cb *desc_cb, + int next_to_use_head) +{ + int ret; + + ret = hns3_fill_skb_desc(ring, skb, &ring->desc[ring->next_to_use], + desc_cb); + if (unlikely(ret < 0)) + goto fill_err; + + /* 'ret < 0' means filling error, 'ret == 0' means skb->len is + * zero, which is unlikely, and 'ret > 0' means how many tx desc + * need to be notified to the hw. + */ + ret = hns3_handle_desc_filling(ring, skb); + if (likely(ret > 0)) + return ret; + +fill_err: + hns3_clear_desc(ring, next_to_use_head); + return ret; +} + netdev_tx_t hns3_nic_net_xmit(struct sk_buff *skb, struct net_device *netdev) { struct hns3_nic_priv *priv = netdev_priv(netdev); struct hns3_enet_ring *ring = &priv->ring[skb->queue_mapping]; struct hns3_desc_cb *desc_cb = &ring->desc_cb[ring->next_to_use]; struct netdev_queue *dev_queue; - int pre_ntu, next_to_use_head; + int pre_ntu, ret; bool doorbell; - int ret;
/* Hardware can only handle short frames above 32 bytes */ if (skb_put_padto(skb, HNS3_MIN_TX_LEN)) { @@ -2217,20 +2240,9 @@ netdev_tx_t hns3_nic_net_xmit(struct sk_buff *skb, struct net_device *netdev) goto out_err_tx_ok; }
- next_to_use_head = ring->next_to_use; - - ret = hns3_fill_skb_desc(ring, skb, &ring->desc[ring->next_to_use], - desc_cb); - if (unlikely(ret < 0)) - goto fill_err; - - /* 'ret < 0' means filling error, 'ret == 0' means skb->len is - * zero, which is unlikely, and 'ret > 0' means how many tx desc - * need to be notified to the hw. - */ - ret = hns3_handle_desc_filling(ring, skb); + ret = hns3_handle_skb_desc(ring, skb, desc_cb, ring->next_to_use); if (unlikely(ret <= 0)) - goto fill_err; + goto out_err_tx_ok;
pre_ntu = ring->next_to_use ? (ring->next_to_use - 1) : (ring->desc_num - 1); @@ -2252,9 +2264,6 @@ netdev_tx_t hns3_nic_net_xmit(struct sk_buff *skb, struct net_device *netdev)
return NETDEV_TX_OK;
-fill_err: - hns3_clear_desc(ring, next_to_use_head); - out_err_tx_ok: dev_kfree_skb_any(skb); hns3_tx_doorbell(ring, 0, !netdev_xmit_more());
From: Jian Shen shenjian15@huawei.com
mainline inclusion from mainline-master commit d25f5eddbe1a category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4M1HB CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
----------------------------------------------------------------------
Currently the function hclge_update_port_base_vlan_cfg() is a bit long. Split it to several small functions, to improve the readability.
Signed-off-by: Jian Shen shenjian15@huawei.com Signed-off-by: Guangbin Huang huangguangbin2@huawei.com Signed-off-by: David S. Miller davem@davemloft.net Reviewed-by: Yongxin Li liyongxin1@huawei.com Signed-off-by: Junxin Chen chenjunxin1@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- .../hisilicon/hns3/hns3pf/hclge_main.c | 69 ++++++++++--------- 1 file changed, 36 insertions(+), 33 deletions(-)
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c index 0801a7d0d482..7da6b4252cb7 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c @@ -10540,12 +10540,41 @@ static bool hclge_need_update_vlan_filter(const struct hclge_vlan_info *new_cfg, return false; }
+static int hclge_modify_port_base_vlan_tag(struct hclge_vport *vport, + struct hclge_vlan_info *new_info, + struct hclge_vlan_info *old_info) +{ + struct hclge_dev *hdev = vport->back; + int ret; + + /* add new VLAN tag */ + ret = hclge_set_vlan_filter_hw(hdev, htons(new_info->vlan_proto), + vport->vport_id, new_info->vlan_tag, + false); + if (ret) + return ret; + + /* remove old VLAN tag */ + if (old_info->vlan_tag == 0) + ret = hclge_set_vf_vlan_common(hdev, vport->vport_id, + true, 0); + else + ret = hclge_set_vlan_filter_hw(hdev, htons(ETH_P_8021Q), + vport->vport_id, + old_info->vlan_tag, true); + if (ret) + dev_err(&hdev->pdev->dev, + "failed to clear vport%u port base vlan %u, ret = %d.\n", + vport->vport_id, old_info->vlan_tag, ret); + + return ret; +} + int hclge_update_port_base_vlan_cfg(struct hclge_vport *vport, u16 state, struct hclge_vlan_info *vlan_info) { struct hnae3_handle *nic = &vport->nic; struct hclge_vlan_info *old_vlan_info; - struct hclge_dev *hdev = vport->back; int ret;
old_vlan_info = &vport->port_base_vlan_cfg.vlan_info; @@ -10558,38 +10587,12 @@ int hclge_update_port_base_vlan_cfg(struct hclge_vport *vport, u16 state, if (!hclge_need_update_vlan_filter(vlan_info, old_vlan_info)) goto out;
- if (state == HNAE3_PORT_BASE_VLAN_MODIFY) { - /* add new VLAN tag */ - ret = hclge_set_vlan_filter_hw(hdev, - htons(vlan_info->vlan_proto), - vport->vport_id, - vlan_info->vlan_tag, - false); - if (ret) - return ret; - - /* remove old VLAN tag */ - if (old_vlan_info->vlan_tag == 0) - ret = hclge_set_vf_vlan_common(hdev, vport->vport_id, - true, 0); - else - ret = hclge_set_vlan_filter_hw(hdev, - htons(ETH_P_8021Q), - vport->vport_id, - old_vlan_info->vlan_tag, - true); - if (ret) { - dev_err(&hdev->pdev->dev, - "failed to clear vport%u port base vlan %u, ret = %d.\n", - vport->vport_id, old_vlan_info->vlan_tag, ret); - return ret; - } - - goto out; - } - - ret = hclge_update_vlan_filter_entries(vport, state, vlan_info, - old_vlan_info); + if (state == HNAE3_PORT_BASE_VLAN_MODIFY) + ret = hclge_modify_port_base_vlan_tag(vport, vlan_info, + old_vlan_info); + else + ret = hclge_update_vlan_filter_entries(vport, state, vlan_info, + old_vlan_info); if (ret) return ret;
From: Jie Wang wangjie125@huawei.com
mainline inclusion from mainline-master commit 673b35b6a5bf category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4M1HB CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
----------------------------------------------------------------------
Currently hclge_configure() is a bit long. Refactor it by extracting sub process to improve the readability.
Signed-off-by: Jie Wang wangjie125@huawei.com Signed-off-by: Guangbin Huang huangguangbin2@huawei.com Signed-off-by: David S. Miller davem@davemloft.net Reviewed-by: Yongxin Li liyongxin1@huawei.com Signed-off-by: Junxin Chen chenjunxin1@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- .../hisilicon/hns3/hns3pf/hclge_main.c | 53 ++++++++++--------- 1 file changed, 29 insertions(+), 24 deletions(-)
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c index 7da6b4252cb7..427c4974dea3 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c @@ -1613,12 +1613,39 @@ static void hclge_init_kdump_kernel_config(struct hclge_dev *hdev) hdev->num_rx_desc = HCLGE_MIN_RX_DESC; }
+static void hclge_init_tc_config(struct hclge_dev *hdev) +{ + unsigned int i; + + if (hdev->tc_max > HNAE3_MAX_TC || + hdev->tc_max < 1) { + dev_warn(&hdev->pdev->dev, "TC num = %u.\n", + hdev->tc_max); + hdev->tc_max = 1; + } + + /* Dev does not support DCB */ + if (!hnae3_dev_dcb_supported(hdev)) { + hdev->tc_max = 1; + hdev->pfc_max = 0; + } else { + hdev->pfc_max = hdev->tc_max; + } + + hdev->tm_info.num_tc = 1; + + /* Currently not support uncontiuous tc */ + for (i = 0; i < hdev->tm_info.num_tc; i++) + hnae3_set_bit(hdev->hw_tc_map, i, 1); + + hdev->tx_sch_mode = HCLGE_FLAG_TC_BASE_SCH_MODE; +} + static int hclge_configure(struct hclge_dev *hdev) { struct hnae3_ae_dev *ae_dev = pci_get_drvdata(hdev->pdev); const struct cpumask *cpumask = cpu_online_mask; struct hclge_cfg cfg; - unsigned int i; int node, ret;
ret = hclge_get_cfg(hdev, &cfg); @@ -1662,29 +1689,7 @@ static int hclge_configure(struct hclge_dev *hdev)
hdev->hw.mac.max_speed = hclge_get_max_speed(cfg.speed_ability);
- if ((hdev->tc_max > HNAE3_MAX_TC) || - (hdev->tc_max < 1)) { - dev_warn(&hdev->pdev->dev, "TC num = %u.\n", - hdev->tc_max); - hdev->tc_max = 1; - } - - /* Dev does not support DCB */ - if (!hnae3_dev_dcb_supported(hdev)) { - hdev->tc_max = 1; - hdev->pfc_max = 0; - } else { - hdev->pfc_max = hdev->tc_max; - } - - hdev->tm_info.num_tc = 1; - - /* Currently not support uncontiuous tc */ - for (i = 0; i < hdev->tm_info.num_tc; i++) - hnae3_set_bit(hdev->hw_tc_map, i, 1); - - hdev->tx_sch_mode = HCLGE_FLAG_TC_BASE_SCH_MODE; - + hclge_init_tc_config(hdev); hclge_init_kdump_kernel_config(hdev);
/* Set the affinity based on numa node */
From: Jie Wang wangjie125@huawei.com
mainline inclusion from mainline-master commit 358e3edb31d5 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4M1HB CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
----------------------------------------------------------------------
Currently hclge_set_channels() is a bit long. Refactor it by extracting sub process to improve the readability.
Signed-off-by: Jie Wang wangjie125@huawei.com Signed-off-by: Guangbin Huang huangguangbin2@huawei.com Signed-off-by: David S. Miller davem@davemloft.net Reviewed-by: Yongxin Li liyongxin1@huawei.com Signed-off-by: Junxin Chen chenjunxin1@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- .../hisilicon/hns3/hns3pf/hclge_main.c | 46 +++++++++++-------- 1 file changed, 28 insertions(+), 18 deletions(-)
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c index 427c4974dea3..5d1cfd5d1fd5 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c @@ -12347,19 +12347,42 @@ static void hclge_get_tqps_and_rss_info(struct hnae3_handle *handle, *max_rss_size = hdev->pf_rss_size_max; }
+static int hclge_set_rss_tc_mode_cfg(struct hnae3_handle *handle) +{ + struct hclge_vport *vport = hclge_get_vport(handle); + u16 tc_offset[HCLGE_MAX_TC_NUM] = {0}; + struct hclge_dev *hdev = vport->back; + u16 tc_size[HCLGE_MAX_TC_NUM] = {0}; + u16 tc_valid[HCLGE_MAX_TC_NUM]; + u16 roundup_size; + unsigned int i; + + roundup_size = roundup_pow_of_two(vport->nic.kinfo.rss_size); + roundup_size = ilog2(roundup_size); + /* Set the RSS TC mode according to the new RSS size */ + for (i = 0; i < HCLGE_MAX_TC_NUM; i++) { + tc_valid[i] = 0; + + if (!(hdev->hw_tc_map & BIT(i))) + continue; + + tc_valid[i] = 1; + tc_size[i] = roundup_size; + tc_offset[i] = vport->nic.kinfo.rss_size * i; + } + + return hclge_set_rss_tc_mode(hdev, tc_valid, tc_size, tc_offset); +} + static int hclge_set_channels(struct hnae3_handle *handle, u32 new_tqps_num, bool rxfh_configured) { struct hnae3_ae_dev *ae_dev = pci_get_drvdata(handle->pdev); struct hclge_vport *vport = hclge_get_vport(handle); struct hnae3_knic_private_info *kinfo = &vport->nic.kinfo; - u16 tc_offset[HCLGE_MAX_TC_NUM] = {0}; struct hclge_dev *hdev = vport->back; - u16 tc_size[HCLGE_MAX_TC_NUM] = {0}; u16 cur_rss_size = kinfo->rss_size; u16 cur_tqps = kinfo->num_tqps; - u16 tc_valid[HCLGE_MAX_TC_NUM]; - u16 roundup_size; u32 *rss_indir; unsigned int i; int ret; @@ -12372,20 +12395,7 @@ static int hclge_set_channels(struct hnae3_handle *handle, u32 new_tqps_num, return ret; }
- roundup_size = roundup_pow_of_two(kinfo->rss_size); - roundup_size = ilog2(roundup_size); - /* Set the RSS TC mode according to the new RSS size */ - for (i = 0; i < HCLGE_MAX_TC_NUM; i++) { - tc_valid[i] = 0; - - if (!(hdev->hw_tc_map & BIT(i))) - continue; - - tc_valid[i] = 1; - tc_size[i] = roundup_size; - tc_offset[i] = kinfo->rss_size * i; - } - ret = hclge_set_rss_tc_mode(hdev, tc_valid, tc_size, tc_offset); + ret = hclge_set_rss_tc_mode_cfg(handle); if (ret) return ret;
From: Jie Wang wangjie125@huawei.com
mainline inclusion from mainline-master commit 1b33341e3dc0 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4M1HB CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
----------------------------------------------------------------------
Currently hns3_get_vector_ring_chain() is a bit long. Refactor it by extracting sub process to improve the readability.
Signed-off-by: Jie Wang wangjie125@huawei.com Signed-off-by: Guangbin Huang huangguangbin2@huawei.com Signed-off-by: David S. Miller davem@davemloft.net Reviewed-by: Yongxin Li liyongxin1@huawei.com Signed-off-by: Junxin Chen chenjunxin1@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- .../net/ethernet/hisilicon/hns3/hns3_enet.c | 121 ++++++++---------- 1 file changed, 53 insertions(+), 68 deletions(-)
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c index aa513567f066..c1444dd93195 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c @@ -4341,87 +4341,70 @@ static int hns3_nic_common_poll(struct napi_struct *napi, int budget) return rx_pkt_total; }
-static int hns3_get_vector_ring_chain(struct hns3_enet_tqp_vector *tqp_vector, - struct hnae3_ring_chain_node *head) +static int hns3_create_ring_chain(struct hns3_enet_tqp_vector *tqp_vector, + struct hnae3_ring_chain_node **head, + bool is_tx) { + u32 bit_value = is_tx ? HNAE3_RING_TYPE_TX : HNAE3_RING_TYPE_RX; + u32 field_value = is_tx ? HNAE3_RING_GL_TX : HNAE3_RING_GL_RX; + struct hnae3_ring_chain_node *cur_chain = *head; struct pci_dev *pdev = tqp_vector->handle->pdev; - struct hnae3_ring_chain_node *cur_chain = head; struct hnae3_ring_chain_node *chain; - struct hns3_enet_ring *tx_ring; - struct hns3_enet_ring *rx_ring; - - tx_ring = tqp_vector->tx_group.ring; - if (tx_ring) { - cur_chain->tqp_index = tx_ring->tqp->tqp_index; - hnae3_set_bit(cur_chain->flag, HNAE3_RING_TYPE_B, - HNAE3_RING_TYPE_TX); - hnae3_set_field(cur_chain->int_gl_idx, HNAE3_RING_GL_IDX_M, - HNAE3_RING_GL_IDX_S, HNAE3_RING_GL_TX); - - cur_chain->next = NULL; - - while (tx_ring->next) { - tx_ring = tx_ring->next; - - chain = devm_kzalloc(&pdev->dev, sizeof(*chain), - GFP_KERNEL); - if (!chain) - goto err_free_chain; - - cur_chain->next = chain; - chain->tqp_index = tx_ring->tqp->tqp_index; - hnae3_set_bit(chain->flag, HNAE3_RING_TYPE_B, - HNAE3_RING_TYPE_TX); - hnae3_set_field(chain->int_gl_idx, - HNAE3_RING_GL_IDX_M, - HNAE3_RING_GL_IDX_S, - HNAE3_RING_GL_TX); - - cur_chain = chain; - } - } + struct hns3_enet_ring *ring;
- rx_ring = tqp_vector->rx_group.ring; - if (!tx_ring && rx_ring) { - cur_chain->next = NULL; - cur_chain->tqp_index = rx_ring->tqp->tqp_index; - hnae3_set_bit(cur_chain->flag, HNAE3_RING_TYPE_B, - HNAE3_RING_TYPE_RX); - hnae3_set_field(cur_chain->int_gl_idx, HNAE3_RING_GL_IDX_M, - HNAE3_RING_GL_IDX_S, HNAE3_RING_GL_RX); + ring = is_tx ? tqp_vector->tx_group.ring : tqp_vector->rx_group.ring;
- rx_ring = rx_ring->next; + if (cur_chain) { + while (cur_chain->next) + cur_chain = cur_chain->next; }
- while (rx_ring) { + while (ring) { chain = devm_kzalloc(&pdev->dev, sizeof(*chain), GFP_KERNEL); if (!chain) - goto err_free_chain; - - cur_chain->next = chain; - chain->tqp_index = rx_ring->tqp->tqp_index; + return -ENOMEM; + if (cur_chain) + cur_chain->next = chain; + else + *head = chain; + chain->tqp_index = ring->tqp->tqp_index; hnae3_set_bit(chain->flag, HNAE3_RING_TYPE_B, - HNAE3_RING_TYPE_RX); - hnae3_set_field(chain->int_gl_idx, HNAE3_RING_GL_IDX_M, - HNAE3_RING_GL_IDX_S, HNAE3_RING_GL_RX); + bit_value); + hnae3_set_field(chain->int_gl_idx, + HNAE3_RING_GL_IDX_M, + HNAE3_RING_GL_IDX_S, field_value);
cur_chain = chain;
- rx_ring = rx_ring->next; + ring = ring->next; }
return 0; +} + +static struct hnae3_ring_chain_node * +hns3_get_vector_ring_chain(struct hns3_enet_tqp_vector *tqp_vector) +{ + struct pci_dev *pdev = tqp_vector->handle->pdev; + struct hnae3_ring_chain_node *cur_chain = NULL; + struct hnae3_ring_chain_node *chain; + + if (hns3_create_ring_chain(tqp_vector, &cur_chain, true)) + goto err_free_chain; + + if (hns3_create_ring_chain(tqp_vector, &cur_chain, false)) + goto err_free_chain; + + return cur_chain;
err_free_chain: - cur_chain = head->next; while (cur_chain) { chain = cur_chain->next; devm_kfree(&pdev->dev, cur_chain); cur_chain = chain; } - head->next = NULL;
- return -ENOMEM; + return NULL; }
static void hns3_free_vector_ring_chain(struct hns3_enet_tqp_vector *tqp_vector, @@ -4430,7 +4413,7 @@ static void hns3_free_vector_ring_chain(struct hns3_enet_tqp_vector *tqp_vector, struct pci_dev *pdev = tqp_vector->handle->pdev; struct hnae3_ring_chain_node *chain_tmp, *chain;
- chain = head->next; + chain = head;
while (chain) { chain_tmp = chain->next; @@ -4545,7 +4528,7 @@ static int hns3_nic_init_vector_data(struct hns3_nic_priv *priv) }
for (i = 0; i < priv->vector_num; i++) { - struct hnae3_ring_chain_node vector_ring_chain; + struct hnae3_ring_chain_node *vector_ring_chain;
tqp_vector = &priv->tqp_vector[i];
@@ -4555,15 +4538,16 @@ static int hns3_nic_init_vector_data(struct hns3_nic_priv *priv) tqp_vector->tx_group.total_packets = 0; tqp_vector->handle = h;
- ret = hns3_get_vector_ring_chain(tqp_vector, - &vector_ring_chain); - if (ret) + vector_ring_chain = hns3_get_vector_ring_chain(tqp_vector); + if (!vector_ring_chain) { + ret = -ENOMEM; goto map_ring_fail; + }
ret = h->ae_algo->ops->map_ring_to_vector(h, - tqp_vector->vector_irq, &vector_ring_chain); + tqp_vector->vector_irq, vector_ring_chain);
- hns3_free_vector_ring_chain(tqp_vector, &vector_ring_chain); + hns3_free_vector_ring_chain(tqp_vector, vector_ring_chain);
if (ret) goto map_ring_fail; @@ -4662,7 +4646,7 @@ static void hns3_clear_ring_group(struct hns3_enet_ring_group *group)
static void hns3_nic_uninit_vector_data(struct hns3_nic_priv *priv) { - struct hnae3_ring_chain_node vector_ring_chain; + struct hnae3_ring_chain_node *vector_ring_chain; struct hnae3_handle *h = priv->ae_handle; struct hns3_enet_tqp_vector *tqp_vector; int i; @@ -4677,13 +4661,14 @@ static void hns3_nic_uninit_vector_data(struct hns3_nic_priv *priv) * chain between vector and ring, we should go on to deal with * the remaining options. */ - if (hns3_get_vector_ring_chain(tqp_vector, &vector_ring_chain)) + vector_ring_chain = hns3_get_vector_ring_chain(tqp_vector); + if (!vector_ring_chain) dev_warn(priv->dev, "failed to get ring chain\n");
h->ae_algo->ops->unmap_ring_from_vector(h, - tqp_vector->vector_irq, &vector_ring_chain); + tqp_vector->vector_irq, vector_ring_chain);
- hns3_free_vector_ring_chain(tqp_vector, &vector_ring_chain); + hns3_free_vector_ring_chain(tqp_vector, vector_ring_chain);
hns3_clear_ring_group(&tqp_vector->rx_group); hns3_clear_ring_group(&tqp_vector->tx_group);
From: Yufeng Mo moyufeng@huawei.com
mainline inclusion from mainline-master commit 23e0316049af category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4M1HB CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
----------------------------------------------------------------------
hclge_cfg_common_loopback() is a bit too long, so encapsulate hclge_cfg_common_loopback_cmd_send() and hclge_cfg_common_loopback_wait() two functions to improve readability.
Signed-off-by: Yufeng Mo moyufeng@huawei.com Signed-off-by: Guangbin Huang huangguangbin2@huawei.com Signed-off-by: David S. Miller davem@davemloft.net Reviewed-by: Yongxin Li liyongxin1@huawei.com Signed-off-by: Junxin Chen chenjunxin1@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- .../hisilicon/hns3/hns3pf/hclge_main.c | 62 +++++++++++++------ 1 file changed, 42 insertions(+), 20 deletions(-)
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c index 5d1cfd5d1fd5..3e8e4ad4c347 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c @@ -8000,16 +8000,13 @@ static int hclge_set_app_loopback(struct hclge_dev *hdev, bool en) return ret; }
-static int hclge_cfg_common_loopback(struct hclge_dev *hdev, bool en, - enum hnae3_loop loop_mode) +static int hclge_cfg_common_loopback_cmd_send(struct hclge_dev *hdev, bool en, + enum hnae3_loop loop_mode) { -#define HCLGE_COMMON_LB_RETRY_MS 10 -#define HCLGE_COMMON_LB_RETRY_NUM 100 - struct hclge_common_lb_cmd *req; struct hclge_desc desc; - int ret, i = 0; u8 loop_mode_b; + int ret;
req = (struct hclge_common_lb_cmd *)desc.data; hclge_cmd_setup_basic_desc(&desc, HCLGE_OPC_COMMON_LOOPBACK, false); @@ -8026,23 +8023,34 @@ static int hclge_cfg_common_loopback(struct hclge_dev *hdev, bool en, break; default: dev_err(&hdev->pdev->dev, - "unsupported common loopback mode %d\n", loop_mode); + "unsupported loopback mode %d\n", loop_mode); return -ENOTSUPP; }
- if (en) { + req->mask = loop_mode_b; + if (en) req->enable = loop_mode_b; - req->mask = loop_mode_b; - } else { - req->mask = loop_mode_b; - }
ret = hclge_cmd_send(&hdev->hw, &desc, 1); - if (ret) { + if (ret) dev_err(&hdev->pdev->dev, - "common loopback set fail, ret = %d\n", ret); - return ret; - } + "failed to send loopback cmd, loop_mode = %d, ret = %d\n", + loop_mode, ret); + + return ret; +} + +static int hclge_cfg_common_loopback_wait(struct hclge_dev *hdev) +{ +#define HCLGE_COMMON_LB_RETRY_MS 10 +#define HCLGE_COMMON_LB_RETRY_NUM 100 + + struct hclge_common_lb_cmd *req; + struct hclge_desc desc; + u32 i = 0; + int ret; + + req = (struct hclge_common_lb_cmd *)desc.data;
do { msleep(HCLGE_COMMON_LB_RETRY_MS); @@ -8051,20 +8059,34 @@ static int hclge_cfg_common_loopback(struct hclge_dev *hdev, bool en, ret = hclge_cmd_send(&hdev->hw, &desc, 1); if (ret) { dev_err(&hdev->pdev->dev, - "common loopback get, ret = %d\n", ret); + "failed to get loopback done status, ret = %d\n", + ret); return ret; } } while (++i < HCLGE_COMMON_LB_RETRY_NUM && !(req->result & HCLGE_CMD_COMMON_LB_DONE_B));
if (!(req->result & HCLGE_CMD_COMMON_LB_DONE_B)) { - dev_err(&hdev->pdev->dev, "common loopback set timeout\n"); + dev_err(&hdev->pdev->dev, "wait loopback timeout\n"); return -EBUSY; } else if (!(req->result & HCLGE_CMD_COMMON_LB_SUCCESS_B)) { - dev_err(&hdev->pdev->dev, "common loopback set failed in fw\n"); + dev_err(&hdev->pdev->dev, "faile to do loopback test\n"); return -EIO; } - return ret; + + return 0; +} + +static int hclge_cfg_common_loopback(struct hclge_dev *hdev, bool en, + enum hnae3_loop loop_mode) +{ + int ret; + + ret = hclge_cfg_common_loopback_cmd_send(hdev, en, loop_mode); + if (ret) + return ret; + + return hclge_cfg_common_loopback_wait(hdev); }
static int hclge_set_common_loopback(struct hclge_dev *hdev, bool en,
From: Guangbin Huang huangguangbin2@huawei.com
mainline inclusion from mainline-master commit e7a51bf590e3 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4M1HB CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
----------------------------------------------------------------------
Function hclge_set_vlan_filter_hw() is a bit too long, so add a new function hclge_need_update_port_vlan() to simplify code and improve code readability.
Signed-off-by: Guangbin Huang huangguangbin2@huawei.com Signed-off-by: David S. Miller davem@davemloft.net Reviewed-by: Yongxin Li liyongxin1@huawei.com Signed-off-by: Junxin Chen chenjunxin1@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- .../hisilicon/hns3/hns3pf/hclge_main.c | 45 +++++++++++-------- 1 file changed, 27 insertions(+), 18 deletions(-)
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c index 3e8e4ad4c347..6e4b8bf5cf36 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c @@ -10005,6 +10005,32 @@ static int hclge_set_port_vlan_filter(struct hclge_dev *hdev, __be16 proto, return ret; }
+static bool hclge_need_update_port_vlan(struct hclge_dev *hdev, u16 vport_id, + u16 vlan_id, bool is_kill) +{ + /* vlan 0 may be added twice when 8021q module is enabled */ + if (!is_kill && !vlan_id && + test_bit(vport_id, hdev->vlan_table[vlan_id])) + return false; + + if (!is_kill && test_and_set_bit(vport_id, hdev->vlan_table[vlan_id])) { + dev_warn(&hdev->pdev->dev, + "Add port vlan failed, vport %u is already in vlan %u\n", + vport_id, vlan_id); + return false; + } + + if (is_kill && + !test_and_clear_bit(vport_id, hdev->vlan_table[vlan_id])) { + dev_warn(&hdev->pdev->dev, + "Delete port vlan failed, vport %u is not in vlan %u\n", + vport_id, vlan_id); + return false; + } + + return true; +} + static int hclge_set_vlan_filter_hw(struct hclge_dev *hdev, __be16 proto, u16 vport_id, u16 vlan_id, bool is_kill) @@ -10026,26 +10052,9 @@ static int hclge_set_vlan_filter_hw(struct hclge_dev *hdev, __be16 proto, return ret; }
- /* vlan 0 may be added twice when 8021q module is enabled */ - if (!is_kill && !vlan_id && - test_bit(vport_id, hdev->vlan_table[vlan_id])) + if (!hclge_need_update_port_vlan(hdev, vport_id, vlan_id, is_kill)) return 0;
- if (!is_kill && test_and_set_bit(vport_id, hdev->vlan_table[vlan_id])) { - dev_err(&hdev->pdev->dev, - "Add port vlan failed, vport %u is already in vlan %u\n", - vport_id, vlan_id); - return -EINVAL; - } - - if (is_kill && - !test_and_clear_bit(vport_id, hdev->vlan_table[vlan_id])) { - dev_err(&hdev->pdev->dev, - "Delete port vlan failed, vport %u is not in vlan %u\n", - vport_id, vlan_id); - return -EINVAL; - } - for_each_set_bit(vport_idx, hdev->vlan_table[vlan_id], HCLGE_VPORT_NUM) vport_num++;
From: Guangbin Huang huangguangbin2@huawei.com
mainline inclusion from mainline-master commit 114967adbc3d category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4M1HB CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
----------------------------------------------------------------------
This patch adds print vport id when failed to get or set vlan filter parameters.
Signed-off-by: Guangbin Huang huangguangbin2@huawei.com Signed-off-by: David S. Miller davem@davemloft.net Reviewed-by: Yongxin Li liyongxin1@huawei.com Signed-off-by: Junxin Chen chenjunxin1@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-)
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c index 6e4b8bf5cf36..ae3ceaef70a0 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c @@ -9746,8 +9746,8 @@ static int hclge_set_vlan_filter_ctrl(struct hclge_dev *hdev, u8 vlan_type,
ret = hclge_cmd_send(&hdev->hw, &desc, 1); if (ret) { - dev_err(&hdev->pdev->dev, - "failed to get vlan filter config, ret = %d.\n", ret); + dev_err(&hdev->pdev->dev, "failed to get vport%u vlan filter config, ret = %d.\n", + vf_id, ret); return ret; }
@@ -9758,8 +9758,8 @@ static int hclge_set_vlan_filter_ctrl(struct hclge_dev *hdev, u8 vlan_type,
ret = hclge_cmd_send(&hdev->hw, &desc, 1); if (ret) - dev_err(&hdev->pdev->dev, "failed to set vlan filter, ret = %d.\n", - ret); + dev_err(&hdev->pdev->dev, "failed to set vport%u vlan filter, ret = %d.\n", + vf_id, ret);
return ret; }
From: Hao Chen chenhao288@hisilicon.com
mainline inclusion from mainline-master commit 0cc25c6a14ef category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4M1HB CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
----------------------------------------------------------------------
The c language has a set of implicit type conversions, when two variables perform bitwise or arithmetic operations.
For example, variable A (type u16/u8) -1, its output is int type variable. u16/u8 will convert to int type implicitly before it does arithmetic operations. So, change 1 to unsigned type.
Signed-off-by: Hao Chen chenhao288@hisilicon.com Signed-off-by: Guangbin Huang huangguangbin2@huawei.com Signed-off-by: David S. Miller davem@davemloft.net Reviewed-by: Yongxin Li liyongxin1@huawei.com Signed-off-by: Junxin Chen chenjunxin1@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_debugfs.c | 2 +- drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c | 4 ++-- drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_mbx.c | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-)
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_debugfs.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_debugfs.c index 65168125c42e..1d039e18dd72 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_debugfs.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_debugfs.c @@ -99,7 +99,7 @@ static void hclge_dbg_fill_content(char *content, u16 len, static char *hclge_dbg_get_func_id_str(char *buf, u8 id) { if (id) - sprintf(buf, "vf%u", id - 1); + sprintf(buf, "vf%u", id - 1U); else sprintf(buf, "pf");
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c index ae3ceaef70a0..e7c377378150 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c @@ -6806,7 +6806,7 @@ static int hclge_fd_parse_ring_cookie(struct hclge_dev *hdev, u64 ring_cookie, if (vf > hdev->num_req_vfs) { dev_err(&hdev->pdev->dev, "Error: vf id (%u) should be less than %u\n", - vf - 1, hdev->num_req_vfs); + vf - 1U, hdev->num_req_vfs); return -EINVAL; }
@@ -6816,7 +6816,7 @@ static int hclge_fd_parse_ring_cookie(struct hclge_dev *hdev, u64 ring_cookie, if (ring >= tqps) { dev_err(&hdev->pdev->dev, "Error: queue id (%u) > max tqp num (%u)\n", - ring, tqps - 1); + ring, tqps - 1U); return -EINVAL; }
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_mbx.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_mbx.c index c495df2e5953..8b3954b39147 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_mbx.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_mbx.c @@ -181,7 +181,7 @@ static int hclge_get_ring_chain_from_mbx( if (req->msg.param[i].tqp_index >= vport->nic.kinfo.rss_size) { dev_err(&hdev->pdev->dev, "tqp index(%u) is out of range(0-%u)\n", req->msg.param[i].tqp_index, - vport->nic.kinfo.rss_size - 1); + vport->nic.kinfo.rss_size - 1U); return -EINVAL; } }
From: Guangbin Huang huangguangbin2@huawei.com
mainline inclusion from mainline-master commit 72dcdec10fad category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4M1HB CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
----------------------------------------------------------------------
The argument len will not be changed in hclge_ncl_config_data_print(), it is no need to declare as a pointer, so modify it into int type.
Signed-off-by: Guangbin Huang huangguangbin2@huawei.com Signed-off-by: David S. Miller davem@davemloft.net Reviewed-by: Yongxin Li liyongxin1@huawei.com Signed-off-by: Junxin Chen chenjunxin1@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_debugfs.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_debugfs.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_debugfs.c index 1d039e18dd72..1579ca336d06 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_debugfs.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_debugfs.c @@ -1764,7 +1764,7 @@ hclge_dbg_get_imp_stats_info(struct hclge_dev *hdev, char *buf, int len) #define HCLGE_MAX_NCL_CONFIG_LENGTH 16384
static void hclge_ncl_config_data_print(struct hclge_desc *desc, int *index, - char *buf, int *len, int *pos) + char *buf, int len, int *pos) { #define HCLGE_CMD_DATA_NUM 6
@@ -1776,7 +1776,7 @@ static void hclge_ncl_config_data_print(struct hclge_desc *desc, int *index, if (i == 0 && j == 0) continue;
- *pos += scnprintf(buf + *pos, *len - *pos, + *pos += scnprintf(buf + *pos, len - *pos, "0x%04x | 0x%08x\n", offset, le32_to_cpu(desc[i].data[j]));
@@ -1814,7 +1814,7 @@ hclge_dbg_dump_ncl_config(struct hclge_dev *hdev, char *buf, int len) if (ret) return ret;
- hclge_ncl_config_data_print(desc, &index, buf, &len, &pos); + hclge_ncl_config_data_print(desc, &index, buf, len, &pos); }
return 0;
From: Hao Chen chenhao288@hisilicon.com
mainline inclusion from mainline-master commit 9fcadbaae8ea category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4M1HB CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
----------------------------------------------------------------------
Change output value type of atomic_read() from %u to %d.
Signed-off-by: Hao Chen chenhao288@hisilicon.com Signed-off-by: Guangbin Huang huangguangbin2@huawei.com Signed-off-by: David S. Miller davem@davemloft.net Reviewed-by: Yongxin Li liyongxin1@huawei.com Signed-off-by: Junxin Chen chenjunxin1@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- drivers/net/ethernet/hisilicon/hns3/hns3_debugfs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_debugfs.c b/drivers/net/ethernet/hisilicon/hns3/hns3_debugfs.c index 081295bff765..817e2e8a7287 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3_debugfs.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3_debugfs.c @@ -1083,7 +1083,7 @@ static void hns3_dump_page_pool_info(struct hns3_enet_ring *ring, sprintf(result[j++], "%u", index); sprintf(result[j++], "%u", READ_ONCE(ring->page_pool->pages_state_hold_cnt)); - sprintf(result[j++], "%u", + sprintf(result[j++], "%d", atomic_read(&ring->page_pool->pages_state_release_cnt)); sprintf(result[j++], "%u", ring->page_pool->p.pool_size); sprintf(result[j++], "%u", ring->page_pool->p.order);
From: Hao Chen chenhao288@hisilicon.com
mainline inclusion from mainline-master commit 4e599dddeea4 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4M1HB CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
----------------------------------------------------------------------
When we use hclge_dbg_fill_content() to fill contents with specific format according to struct hclge_dbg_item *items, it may cause content cover due to unreasonable items.
So add comments to explain how to avoid it.
Signed-off-by: Hao Chen chenhao288@hisilicon.com Signed-off-by: Guangbin Huang huangguangbin2@huawei.com Signed-off-by: David S. Miller davem@davemloft.net Reviewed-by: Yongxin Li liyongxin1@huawei.com Signed-off-by: Junxin Chen chenjunxin1@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_debugfs.c | 4 ++++ 1 file changed, 4 insertions(+)
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_debugfs.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_debugfs.c index 1579ca336d06..2557e815cb3d 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_debugfs.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_debugfs.c @@ -77,6 +77,10 @@ static const struct hclge_dbg_reg_type_info hclge_dbg_reg_info[] = { .cmd = HCLGE_OPC_DFX_TQP_REG } }, };
+/* make sure: len(name) + interval >= maxlen(item data) + 2, + * for example, name = "pkt_num"(len: 7), the prototype of item data is u32, + * and print as "%u"(maxlen: 10), so the interval should be at least 5. + */ static void hclge_dbg_fill_content(char *content, u16 len, const struct hclge_dbg_item *items, const char **result, u16 size)
From: Hao Chen chenhao288@hisilicon.com
mainline inclusion from mainline-master commit 40975e749daa category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4M1HB CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
----------------------------------------------------------------------
Return value judgment should follow the function call, so remove line between them.
Signed-off-by: Hao Chen chenhao288@hisilicon.com Signed-off-by: Guangbin Huang huangguangbin2@huawei.com Signed-off-by: David S. Miller davem@davemloft.net Reviewed-by: Yongxin Li liyongxin1@huawei.com Signed-off-by: Junxin Chen chenjunxin1@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_debugfs.c | 1 - 1 file changed, 1 deletion(-)
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_debugfs.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_debugfs.c index 2557e815cb3d..c287be8bc48d 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_debugfs.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_debugfs.c @@ -782,7 +782,6 @@ static int hclge_dbg_dump_tm_pg(struct hclge_dev *hdev, char *buf, int len)
data_str = kcalloc(ARRAY_SIZE(tm_pg_items), HCLGE_DBG_DATA_STR_LEN, GFP_KERNEL); - if (!data_str) return -ENOMEM;
From: Hao Chen chenhao288@hisilicon.com
mainline inclusion from mainline-master commit 7acf76b1cd01 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4M1HB CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
----------------------------------------------------------------------
Replace one tab with space between symbol ')' and '{' in for statement of function hclge_map_tqp().
Signed-off-by: Hao Chen chenhao288@hisilicon.com Signed-off-by: Guangbin Huang huangguangbin2@huawei.com Signed-off-by: David S. Miller davem@davemloft.net Reviewed-by: Yongxin Li liyongxin1@huawei.com Signed-off-by: Junxin Chen chenjunxin1@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c index e7c377378150..e182f38c6495 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c @@ -1890,7 +1890,7 @@ static int hclge_map_tqp(struct hclge_dev *hdev) u16 i, num_vport;
num_vport = hdev->num_req_vfs + 1; - for (i = 0; i < num_vport; i++) { + for (i = 0; i < num_vport; i++) { int ret;
ret = hclge_map_tqp_to_vport(hdev, vport);
From: Jie Wang wangjie125@huawei.com
mainline inclusion from mainline-master commit 184da9dc780e category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4M1HB CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
----------------------------------------------------------------------
The hns3 driver header file uses the structure of other files, but does not include corresponding file, which causes a check warning that the header file is not self-contained.
Therefore, the required header file is included in the header file, and the structure declaration is added to the header file to avoid cyclic dependency of the header file.
Signed-off-by: Jie Wang wangjie125@huawei.com Signed-off-by: Guangbin Huang huangguangbin2@huawei.com Signed-off-by: David S. Miller davem@davemloft.net Reviewed-by: Yongxin Li liyongxin1@huawei.com Signed-off-by: Junxin Chen chenjunxin1@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- drivers/net/ethernet/hisilicon/hns3/hns3_debugfs.h | 2 ++ drivers/net/ethernet/hisilicon/hns3/hns3_enet.h | 3 +++ drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_mdio.h | 4 ++++ drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_ptp.h | 3 +++ drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.h | 6 ++++++ 5 files changed, 18 insertions(+)
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_debugfs.h b/drivers/net/ethernet/hisilicon/hns3/hns3_debugfs.h index bd8801065e02..83aa1450ab9f 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3_debugfs.h +++ b/drivers/net/ethernet/hisilicon/hns3/hns3_debugfs.h @@ -4,6 +4,8 @@ #ifndef __HNS3_DEBUGFS_H #define __HNS3_DEBUGFS_H
+#include "hnae3.h" + #define HNS3_DBG_READ_LEN 65536 #define HNS3_DBG_READ_LEN_128KB 0x20000 #define HNS3_DBG_READ_LEN_1MB 0x100000 diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.h b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.h index 2803b2cd7f30..a05a0c7423ce 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.h +++ b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.h @@ -10,6 +10,9 @@
#include "hnae3.h"
+struct iphdr; +struct ipv6hdr; + enum hns3_nic_state { HNS3_NIC_STATE_TESTING, HNS3_NIC_STATE_RESETTING, diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_mdio.h b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_mdio.h index fd0e20190b90..4200d0b6d931 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_mdio.h +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_mdio.h @@ -4,6 +4,10 @@ #ifndef __HCLGE_MDIO_H #define __HCLGE_MDIO_H
+#include "hnae3.h" + +struct hclge_dev; + int hclge_mac_mdio_config(struct hclge_dev *hdev); int hclge_mac_connect_phy(struct hnae3_handle *handle); void hclge_mac_disconnect_phy(struct hnae3_handle *handle); diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_ptp.h b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_ptp.h index 7a9b77de632a..bbee74cd8404 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_ptp.h +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_ptp.h @@ -8,6 +8,9 @@ #include <linux/net_tstamp.h> #include <linux/types.h>
+struct hclge_dev; +struct ifreq; + #define HCLGE_PTP_REG_OFFSET 0x29000
#define HCLGE_PTP_TX_TS_SEQID_REG 0x0 diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.h b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.h index 1db7f40b4525..619cc30a2dfc 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.h +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.h @@ -6,6 +6,12 @@
#include <linux/types.h>
+#include "hnae3.h" + +struct hclge_dev; +struct hclge_vport; +enum hclge_opcode_type; + /* MAC Pause */ #define HCLGE_TX_MAC_PAUSE_EN_MSK BIT(0) #define HCLGE_RX_MAC_PAUSE_EN_MSK BIT(1)
From: Colin Ian King colin.i.king@gmail.com
mainline inclusion from mainline-master commit 3c5290a2dcdb category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4M1HB CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
----------------------------------------------------------------------
There is a spelling mistake in a dev_err message. Fix it.
Signed-off-by: Colin Ian King colin.i.king@gmail.com Link: https://lore.kernel.org/r/20211206091207.113648-1-colin.i.king@gmail.com Signed-off-by: Jakub Kicinski kuba@kernel.org Reviewed-by: Yongxin Li liyongxin1@huawei.com Signed-off-by: Junxin Chen chenjunxin1@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c index e182f38c6495..9f0c0b90ef43 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c @@ -8070,7 +8070,7 @@ static int hclge_cfg_common_loopback_wait(struct hclge_dev *hdev) dev_err(&hdev->pdev->dev, "wait loopback timeout\n"); return -EBUSY; } else if (!(req->result & HCLGE_CMD_COMMON_LB_SUCCESS_B)) { - dev_err(&hdev->pdev->dev, "faile to do loopback test\n"); + dev_err(&hdev->pdev->dev, "failed to do loopback test\n"); return -EIO; }
From: Brian Foster bfoster@redhat.com
mainline-inclusion from mainline-v5.12-rc4 commit 7cd3099f4925d7c15887d1940ebd65acd66100f5 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4KIAO CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
-------------------------------------------------
Per-inode ioend completion batching has a log reservation deadlock vector between preallocated append transactions and transactions that are acquired at completion time for other purposes (i.e., unwritten extent conversion or COW fork remaps). For example, if the ioend completion workqueue task executes on a batch of ioends that are sorted such that an append ioend sits at the tail, it's possible for the outstanding append transaction reservation to block allocation of transactions required to process preceding ioends in the list.
Append ioend completion is historically the common path for on-disk inode size updates. While file extending writes may have completed sometime earlier, the on-disk inode size is only updated after successful writeback completion. These transactions are preallocated serially from writeback context to mitigate concurrency and associated log reservation pressure across completions processed by multi-threaded workqueue tasks.
However, now that delalloc blocks unconditionally map to unwritten extents at physical block allocation time, size updates via append ioends are relatively rare. This means that inode size updates most commonly occur as part of the preexisting completion time transaction to convert unwritten extents. As a result, there is no longer a strong need to preallocate size update transactions.
Remove the preallocation of inode size update transactions to avoid the ioend completion processing log reservation deadlock. Instead, continue to send all potential size extending ioends to workqueue context for completion and allocate the transaction from that context. This ensures that no outstanding log reservation is owned by the ioend completion worker task when it begins to process ioends.
Signed-off-by: Brian Foster bfoster@redhat.com Reviewed-by: Christoph Hellwig hch@lst.de Reviewed-by: Darrick J. Wong djwong@kernel.org Signed-off-by: Darrick J. Wong djwong@kernel.org Signed-off-by: Guo Xuenan guoxuenan@huawei.com Reviewed-by: Lihong Kou koulihong@huawei.com Reviewed-by: Zhang Yi yi.zhang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- fs/xfs/xfs_aops.c | 45 +++------------------------------------------ 1 file changed, 3 insertions(+), 42 deletions(-)
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 4304c6416fbb..7ffbeb14a36b 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -39,33 +39,6 @@ static inline bool xfs_ioend_is_append(struct iomap_ioend *ioend) XFS_I(ioend->io_inode)->i_d.di_size; }
-STATIC int -xfs_setfilesize_trans_alloc( - struct iomap_ioend *ioend) -{ - struct xfs_mount *mp = XFS_I(ioend->io_inode)->i_mount; - struct xfs_trans *tp; - int error; - - error = xfs_trans_alloc(mp, &M_RES(mp)->tr_fsyncts, 0, 0, 0, &tp); - if (error) - return error; - - ioend->io_private = tp; - - /* - * We may pass freeze protection with a transaction. So tell lockdep - * we released it. - */ - __sb_writers_release(ioend->io_inode->i_sb, SB_FREEZE_FS); - /* - * We hand off the transaction to the completion thread now, so - * clear the flag here. - */ - current_restore_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS); - return 0; -} - /* * Update on-disk file size now that data has been written to disk. */ @@ -182,12 +155,10 @@ xfs_end_ioend( error = xfs_reflink_end_cow(ip, offset, size); else if (ioend->io_type == IOMAP_UNWRITTEN) error = xfs_iomap_write_unwritten(ip, offset, size, false); - else - ASSERT(!xfs_ioend_is_append(ioend) || ioend->io_private);
+ if (!error && xfs_ioend_is_append(ioend)) + error = xfs_setfilesize(ip, ioend->io_offset, ioend->io_size); done: - if (ioend->io_private) - error = xfs_setfilesize_ioend(ioend, error); iomap_finish_ioends(ioend, error); memalloc_nofs_restore(nofs_flag); } @@ -237,7 +208,7 @@ xfs_end_io(
static inline bool xfs_ioend_needs_workqueue(struct iomap_ioend *ioend) { - return ioend->io_private || + return xfs_ioend_is_append(ioend) || ioend->io_type == IOMAP_UNWRITTEN || (ioend->io_flags & IOMAP_F_SHARED); } @@ -250,8 +221,6 @@ xfs_end_bio( struct xfs_inode *ip = XFS_I(ioend->io_inode); unsigned long flags;
- ASSERT(xfs_ioend_needs_workqueue(ioend)); - spin_lock_irqsave(&ip->i_ioend_lock, flags); if (list_empty(&ip->i_ioend_list)) WARN_ON_ONCE(!queue_work(ip->i_mount->m_unwritten_workqueue, @@ -501,14 +470,6 @@ xfs_prepare_ioend( ioend->io_offset, ioend->io_size); }
- /* Reserve log space if we might write beyond the on-disk inode size. */ - if (!status && - ((ioend->io_flags & IOMAP_F_SHARED) || - ioend->io_type != IOMAP_UNWRITTEN) && - xfs_ioend_is_append(ioend) && - !ioend->io_private) - status = xfs_setfilesize_trans_alloc(ioend); - memalloc_nofs_restore(nofs_flag);
if (xfs_ioend_needs_workqueue(ioend))
From: "Darrick J. Wong" djwong@kernel.org
mainline-inclusion from mainline-v5.12-rc4 commit 1aec7c3d05670b92b7339b19999009a93808efb9 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4KIAO CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
-------------------------------------------------
In commit f8f2835a9cf3 we changed the behavior of XFS to use EFIs to remove blocks from an overfilled AGFL because there were complaints about transaction overruns that stemmed from trying to free multiple blocks in a single transaction.
Unfortunately, that commit missed a subtlety in the debug-mode transaction accounting when a realtime volume is attached. If a realtime file undergoes a data fork mapping change such that realtime extents are allocated (or freed) in the same transaction that a data device block is also allocated (or freed), we can trip a debugging assertion. This can happen (for example) if a realtime extent is allocated and it is necessary to reshape the bmbt to hold the new mapping.
When we go to allocate a bmbt block from an AG, the first thing the data device block allocator does is ensure that the freelist is the proper length. If the freelist is too long, it will trim the freelist to the proper length.
In debug mode, trimming the freelist calls xfs_trans_agflist_delta() to record the decrement in the AG free list count. Prior to f8f28 we would put the free block back in the free space btrees in the same transaction, which calls xfs_trans_agblocks_delta() to record the increment in the AG free block count. Since AGFL blocks are included in the global free block count (fdblocks), there is no corresponding fdblocks update, so the AGFL free satisfies the following condition in xfs_trans_apply_sb_deltas:
/* * Check that superblock mods match the mods made to AGF counters. */ ASSERT((tp->t_fdblocks_delta + tp->t_res_fdblocks_delta) == (tp->t_ag_freeblks_delta + tp->t_ag_flist_delta + tp->t_ag_btree_delta));
The comparison here used to be: (X + 0) == ((X+1) + -1 + 0), where X is the number blocks that were allocated.
After commit f8f28 we defer the block freeing to the next chained transaction, which means that the calls to xfs_trans_agflist_delta and xfs_trans_agblocks_delta occur in separate transactions. The (first) transaction that shortens the free list trips on the comparison, which has now become:
(X + 0) == ((X) + -1 + 0)
because we haven't freed the AGFL block yet; we've only logged an intention to free it. When the second transaction (the deferred free) commits, it will evaluate the expression as:
(0 + 0) == (1 + 0 + 0)
and trip over that in turn.
At this point, the astute reader may note that the two commits tagged by this patch have been in the kernel for a long time but haven't generated any bug reports. How is it that the author became aware of this bug?
This originally surfaced as an intermittent failure when I was testing realtime rmap, but a different bug report by Zorro Lang reveals the same assertion occuring on !lazysbcount filesystems.
The common factor to both reports (and why this problem wasn't previously reported) becomes apparent if we consider when xfs_trans_apply_sb_deltas is called by __xfs_trans_commit():
if (tp->t_flags & XFS_TRANS_SB_DIRTY) xfs_trans_apply_sb_deltas(tp);
With a modern lazysbcount filesystem, transactions update only the percpu counters, so they don't need to set XFS_TRANS_SB_DIRTY, hence xfs_trans_apply_sb_deltas is rarely called.
However, updates to the count of free realtime extents are not part of lazysbcount, so XFS_TRANS_SB_DIRTY will be set on transactions adding or removing data fork mappings to realtime files; similarly, XFS_TRANS_SB_DIRTY is always set on !lazysbcount filesystems.
Dave mentioned in response to an earlier version of this patch:
"IIUC, what you are saying is that this debug code is simply not exercised in normal testing and hasn't been for the past decade? And it still won't be exercised on anything other than realtime device testing?
"...it was debugging code from 1994 that was largely turned into dead code when lazysbcounters were introduced in 2007. Hence I'm not sure it holds any value anymore."
This debugging code isn't especially helpful - you can modify the flcount on one AG and the freeblks of another AG, and it won't trigger. Add the fact that nobody noticed for a decade, and let's just get rid of it (and start testing realtime :P).
This bug was found by running generic/051 on either a V4 filesystem lacking lazysbcount; or a V5 filesystem with a realtime volume.
Cc: bfoster@redhat.com, zlang@redhat.com Fixes: f8f2835a9cf3 ("xfs: defer agfl block frees when dfops is available") Signed-off-by: Darrick J. Wong djwong@kernel.org Reviewed-by: Brian Foster bfoster@redhat.com Signed-off-by: Guo Xuenan guoxuenan@huawei.com Reviewed-by: Lihong Kou koulihong@huawei.com Reviewed-by: Zhang Yi yi.zhang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- fs/xfs/libxfs/xfs_alloc.c | 3 --- fs/xfs/libxfs/xfs_alloc_btree.c | 2 -- fs/xfs/libxfs/xfs_rmap_btree.c | 2 -- fs/xfs/xfs_fsops.c | 2 -- fs/xfs/xfs_trans.c | 7 ------- fs/xfs/xfs_trans.h | 15 --------------- 6 files changed, 31 deletions(-)
diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c index 15640015be9d..6989efd55b5b 100644 --- a/fs/xfs/libxfs/xfs_alloc.c +++ b/fs/xfs/libxfs/xfs_alloc.c @@ -718,7 +718,6 @@ xfs_alloc_update_counters( agbp->b_pag->pagf_freeblks += len; be32_add_cpu(&agf->agf_freeblks, len);
- xfs_trans_agblocks_delta(tp, len); if (unlikely(be32_to_cpu(agf->agf_freeblks) > be32_to_cpu(agf->agf_length))) { xfs_buf_mark_corrupt(agbp); @@ -2689,7 +2688,6 @@ xfs_alloc_get_freelist( pag = agbp->b_pag; ASSERT(!pag->pagf_agflreset); be32_add_cpu(&agf->agf_flcount, -1); - xfs_trans_agflist_delta(tp, -1); pag->pagf_flcount--;
logflags = XFS_AGF_FLFIRST | XFS_AGF_FLCOUNT; @@ -2796,7 +2794,6 @@ xfs_alloc_put_freelist( pag = agbp->b_pag; ASSERT(!pag->pagf_agflreset); be32_add_cpu(&agf->agf_flcount, 1); - xfs_trans_agflist_delta(tp, 1); pag->pagf_flcount++;
logflags = XFS_AGF_FLLAST | XFS_AGF_FLCOUNT; diff --git a/fs/xfs/libxfs/xfs_alloc_btree.c b/fs/xfs/libxfs/xfs_alloc_btree.c index 8e01231b308e..dbe302d1cb8d 100644 --- a/fs/xfs/libxfs/xfs_alloc_btree.c +++ b/fs/xfs/libxfs/xfs_alloc_btree.c @@ -73,7 +73,6 @@ xfs_allocbt_alloc_block(
xfs_extent_busy_reuse(cur->bc_mp, cur->bc_ag.agno, bno, 1, false);
- xfs_trans_agbtree_delta(cur->bc_tp, 1); new->s = cpu_to_be32(bno);
*stat = 1; @@ -97,7 +96,6 @@ xfs_allocbt_free_block(
xfs_extent_busy_insert(cur->bc_tp, be32_to_cpu(agf->agf_seqno), bno, 1, XFS_EXTENT_BUSY_SKIP_DISCARD); - xfs_trans_agbtree_delta(cur->bc_tp, -1); return 0; }
diff --git a/fs/xfs/libxfs/xfs_rmap_btree.c b/fs/xfs/libxfs/xfs_rmap_btree.c index beb81c84a937..9f5bcbd834c3 100644 --- a/fs/xfs/libxfs/xfs_rmap_btree.c +++ b/fs/xfs/libxfs/xfs_rmap_btree.c @@ -103,7 +103,6 @@ xfs_rmapbt_alloc_block( xfs_extent_busy_reuse(cur->bc_mp, cur->bc_ag.agno, bno, 1, false);
- xfs_trans_agbtree_delta(cur->bc_tp, 1); new->s = cpu_to_be32(bno); be32_add_cpu(&agf->agf_rmap_blocks, 1); xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_RMAP_BLOCKS); @@ -136,7 +135,6 @@ xfs_rmapbt_free_block(
xfs_extent_busy_insert(cur->bc_tp, be32_to_cpu(agf->agf_seqno), bno, 1, XFS_EXTENT_BUSY_SKIP_DISCARD); - xfs_trans_agbtree_delta(cur->bc_tp, -1);
pag = cur->bc_ag.agbp->b_pag; xfs_ag_resv_free_extent(pag, XFS_AG_RESV_RMAPBT, NULL, 1); diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index ef1d5bb88b93..4a3a4f38531a 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -107,8 +107,6 @@ xfs_growfs_data_private( if (error) goto out_trans_cancel;
- xfs_trans_agblocks_delta(tp, id.nfree); - /* If there are new blocks in the old last AG, extend it. */ if (new) { error = xfs_ag_extend_space(mp, tp, &id, new); diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index c94e71f741b6..7b9a394876bd 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -471,13 +471,6 @@ xfs_trans_apply_sb_deltas( bp = xfs_trans_getsb(tp); sbp = bp->b_addr;
- /* - * Check that superblock mods match the mods made to AGF counters. - */ - ASSERT((tp->t_fdblocks_delta + tp->t_res_fdblocks_delta) == - (tp->t_ag_freeblks_delta + tp->t_ag_flist_delta + - tp->t_ag_btree_delta)); - /* * Only update the superblock counters if we are logging them */ diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h index 084658946cc8..2f802d6c9fcf 100644 --- a/fs/xfs/xfs_trans.h +++ b/fs/xfs/xfs_trans.h @@ -140,11 +140,6 @@ typedef struct xfs_trans { int64_t t_res_fdblocks_delta; /* on-disk only chg */ int64_t t_frextents_delta;/* superblock freextents chg*/ int64_t t_res_frextents_delta; /* on-disk only chg */ -#if defined(DEBUG) || defined(XFS_WARN) - int64_t t_ag_freeblks_delta; /* debugging counter */ - int64_t t_ag_flist_delta; /* debugging counter */ - int64_t t_ag_btree_delta; /* debugging counter */ -#endif int64_t t_dblocks_delta;/* superblock dblocks change */ int64_t t_agcount_delta;/* superblock agcount change */ int64_t t_imaxpct_delta;/* superblock imaxpct change */ @@ -165,16 +160,6 @@ typedef struct xfs_trans { */ #define xfs_trans_set_sync(tp) ((tp)->t_flags |= XFS_TRANS_SYNC)
-#if defined(DEBUG) || defined(XFS_WARN) -#define xfs_trans_agblocks_delta(tp, d) ((tp)->t_ag_freeblks_delta += (int64_t)d) -#define xfs_trans_agflist_delta(tp, d) ((tp)->t_ag_flist_delta += (int64_t)d) -#define xfs_trans_agbtree_delta(tp, d) ((tp)->t_ag_btree_delta += (int64_t)d) -#else -#define xfs_trans_agblocks_delta(tp, d) -#define xfs_trans_agflist_delta(tp, d) -#define xfs_trans_agbtree_delta(tp, d) -#endif - /* * XFS transaction mechanism exported interfaces. */
From: "Darrick J. Wong" djwong@kernel.org
mainline-inclusion from mainline-v5.12-rc4 commit e6c01077ec2d28fe8b6e0bc79eddea8d788f6ea3 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4KIAO CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
-------------------------------------------------
The AGF free space btree block counter wasn't added until the lazysbcount feature was added to XFS midway through the life of the V4 format, so ignore the field when checking. Online AGF repair requires rmapbt, so it doesn't need the feature check.
Signed-off-by: Darrick J. Wong djwong@kernel.org Reviewed-by: Dave Chinner dchinner@redhat.com Signed-off-by: Guo Xuenan guoxuenan@huawei.com Reviewed-by: Lihong Kou koulihong@huawei.com Reviewed-by: Zhang Yi yi.zhang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- fs/xfs/scrub/agheader.c | 7 ++++++- fs/xfs/scrub/fscounters.c | 3 ++- 2 files changed, 8 insertions(+), 2 deletions(-)
diff --git a/fs/xfs/scrub/agheader.c b/fs/xfs/scrub/agheader.c index ae8e2e0ac64a..de42d667108a 100644 --- a/fs/xfs/scrub/agheader.c +++ b/fs/xfs/scrub/agheader.c @@ -416,6 +416,10 @@ xchk_agf_xref_btreeblks( xfs_agblock_t btreeblks; int error;
+ /* agf_btreeblks didn't exist before lazysbcount */ + if (!xfs_sb_version_haslazysbcount(&sc->mp->m_sb)) + return; + /* Check agf_rmap_blocks; set up for agf_btreeblks check */ if (sc->sa.rmap_cur) { error = xfs_btree_count_blocks(sc->sa.rmap_cur, &blocks); @@ -586,7 +590,8 @@ xchk_agf( xchk_block_set_corrupt(sc, sc->sa.agf_bp); if (pag->pagf_flcount != be32_to_cpu(agf->agf_flcount)) xchk_block_set_corrupt(sc, sc->sa.agf_bp); - if (pag->pagf_btreeblks != be32_to_cpu(agf->agf_btreeblks)) + if (xfs_sb_version_haslazysbcount(&sc->mp->m_sb) && + pag->pagf_btreeblks != be32_to_cpu(agf->agf_btreeblks)) xchk_block_set_corrupt(sc, sc->sa.agf_bp); xfs_perag_put(pag);
diff --git a/fs/xfs/scrub/fscounters.c b/fs/xfs/scrub/fscounters.c index ec2064ed3c30..ccfd8534ebf3 100644 --- a/fs/xfs/scrub/fscounters.c +++ b/fs/xfs/scrub/fscounters.c @@ -183,7 +183,8 @@ xchk_fscount_aggregate_agcounts( /* Add up the free/freelist/bnobt/cntbt blocks */ fsc->fdblocks += pag->pagf_freeblks; fsc->fdblocks += pag->pagf_flcount; - fsc->fdblocks += pag->pagf_btreeblks; + if (xfs_sb_version_haslazysbcount(&sc->mp->m_sb)) + fsc->fdblocks += pag->pagf_btreeblks;
/* * Per-AG reservations are taken out of the incore counters,
From: Dave Chinner dchinner@redhat.com
mainline-inclusion from mainline-v5.12-rc4 commit 6543990a168acf366f4b6174d7bd46ba15a8a2a6 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4KIAO CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
-------------------------------------------------
Keep the mount superblock counters up to date for !lazysbcount filesystems so that when we log the superblock they do not need updating in any way because they are already correct.
It's found by what Zorro reported: 1. mkfs.xfs -f -l lazy-count=0 -m crc=0 $dev 2. mount $dev $mnt 3. fsstress -d $mnt -p 100 -n 1000 (maybe need more or less io load) 4. umount $mnt 5. xfs_repair -n $dev and I've seen no problem with this patch.
Signed-off-by: Dave Chinner dchinner@redhat.com Reported-by: Zorro Lang zlang@redhat.com Reviewed-by: Gao Xiang hsiangkao@redhat.com Signed-off-by: Gao Xiang hsiangkao@redhat.com Reviewed-by: Darrick J. Wong djwong@kernel.org Signed-off-by: Darrick J. Wong djwong@kernel.org Reviewed-by: Brian Foster bfoster@redhat.com Signed-off-by: Guo Xuenan guoxuenan@huawei.com Reviewed-by: Lihong Kou koulihong@huawei.com Reviewed-by: Zhang Yi yi.zhang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- fs/xfs/libxfs/xfs_sb.c | 16 +++++++++++++--- fs/xfs/xfs_trans.c | 3 +++ 2 files changed, 16 insertions(+), 3 deletions(-)
diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c index 5aeafa59ed27..66e8353da2f3 100644 --- a/fs/xfs/libxfs/xfs_sb.c +++ b/fs/xfs/libxfs/xfs_sb.c @@ -956,9 +956,19 @@ xfs_log_sb( struct xfs_mount *mp = tp->t_mountp; struct xfs_buf *bp = xfs_trans_getsb(tp);
- mp->m_sb.sb_icount = percpu_counter_sum(&mp->m_icount); - mp->m_sb.sb_ifree = percpu_counter_sum(&mp->m_ifree); - mp->m_sb.sb_fdblocks = percpu_counter_sum(&mp->m_fdblocks); + /* + * Lazy sb counters don't update the in-core superblock so do that now. + * If this is at unmount, the counters will be exactly correct, but at + * any other time they will only be ballpark correct because of + * reservations that have been taken out percpu counters. If we have an + * unclean shutdown, this will be corrected by log recovery rebuilding + * the counters from the AGF block counts. + */ + if (xfs_sb_version_haslazysbcount(&mp->m_sb)) { + mp->m_sb.sb_icount = percpu_counter_sum(&mp->m_icount); + mp->m_sb.sb_ifree = percpu_counter_sum(&mp->m_ifree); + mp->m_sb.sb_fdblocks = percpu_counter_sum(&mp->m_fdblocks); + }
xfs_sb_to_disk(bp->b_addr, &mp->m_sb); xfs_trans_buf_set_type(tp, bp, XFS_BLFT_SB_BUF); diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index 7b9a394876bd..f41b16ad5b24 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -613,6 +613,9 @@ xfs_trans_unreserve_and_mod_sb(
/* apply remaining deltas */ spin_lock(&mp->m_sb_lock); + mp->m_sb.sb_fdblocks += tp->t_fdblocks_delta + tp->t_res_fdblocks_delta; + mp->m_sb.sb_icount += idelta; + mp->m_sb.sb_ifree += ifreedelta; mp->m_sb.sb_frextents += rtxdelta; mp->m_sb.sb_dblocks += tp->t_dblocks_delta; mp->m_sb.sb_agcount += tp->t_agcount_delta;
From: "Darrick J. Wong" djwong@kernel.org
mainline-inclusion from mainline-v5.12-rc4 commit e147a756ab263f9d10eafd08b79b9fac1b08e56c category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4KIAO CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
-------------------------------------------------
Since agf_btreeblks didn't exist before the lazysbcount feature, the fs summary count scrubber needs to walk the free space btrees to determine the amount of space being used by those btrees.
Signed-off-by: Darrick J. Wong djwong@kernel.org Reviewed-by: Brian Foster bfoster@redhat.com Reviewed-by: Gao Xiang hsiangkao@redhat.com Signed-off-by: Guo Xuenan guoxuenan@huawei.com Reviewed-by: Lihong Kou koulihong@huawei.com Reviewed-by: Zhang Yi yi.zhang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- fs/xfs/scrub/fscounters.c | 39 ++++++++++++++++++++++++++++++++++++++- 1 file changed, 38 insertions(+), 1 deletion(-)
diff --git a/fs/xfs/scrub/fscounters.c b/fs/xfs/scrub/fscounters.c index ccfd8534ebf3..e6389bcf20cf 100644 --- a/fs/xfs/scrub/fscounters.c +++ b/fs/xfs/scrub/fscounters.c @@ -13,6 +13,7 @@ #include "xfs_alloc.h" #include "xfs_ialloc.h" #include "xfs_health.h" +#include "xfs_btree.h" #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/trace.h" @@ -144,6 +145,35 @@ xchk_setup_fscounters( return xchk_trans_alloc(sc, 0); }
+/* Count free space btree blocks manually for pre-lazysbcount filesystems. */ +static int +xchk_fscount_btreeblks( + struct xfs_scrub *sc, + struct xchk_fscounters *fsc, + xfs_agnumber_t agno) +{ + xfs_extlen_t blocks; + int error; + + error = xchk_ag_init(sc, agno, &sc->sa); + if (error) + return error; + + error = xfs_btree_count_blocks(sc->sa.bno_cur, &blocks); + if (error) + goto out_free; + fsc->fdblocks += blocks - 1; + + error = xfs_btree_count_blocks(sc->sa.cnt_cur, &blocks); + if (error) + goto out_free; + fsc->fdblocks += blocks - 1; + +out_free: + xchk_ag_free(sc, &sc->sa); + return error; +} + /* * Calculate what the global in-core counters ought to be from the incore * per-AG structure. Callers can compare this to the actual in-core counters @@ -183,8 +213,15 @@ xchk_fscount_aggregate_agcounts( /* Add up the free/freelist/bnobt/cntbt blocks */ fsc->fdblocks += pag->pagf_freeblks; fsc->fdblocks += pag->pagf_flcount; - if (xfs_sb_version_haslazysbcount(&sc->mp->m_sb)) + if (xfs_sb_version_haslazysbcount(&sc->mp->m_sb)) { fsc->fdblocks += pag->pagf_btreeblks; + } else { + error = xchk_fscount_btreeblks(sc, fsc, agno); + if (error) { + xfs_perag_put(pag); + break; + } + }
/* * Per-AG reservations are taken out of the incore counters,
From: "Darrick J. Wong" djwong@kernel.org
mainline-inclusion from mainline-v5.12-rc4 commit d4f74e162d238ce00a640af5f0611c3f51dad70e category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4KIAO CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
-------------------------------------------------
The final parameter of filemap_write_and_wait_range is the end of the range to flush, not the length of the range to flush.
Fixes: 46afb0628b86 ("xfs: only flush the unshared range in xfs_reflink_unshare") Signed-off-by: Darrick J. Wong djwong@kernel.org Reviewed-by: Chandan Babu R chandanrlinux@gmail.com Reviewed-by: Brian Foster bfoster@redhat.com Signed-off-by: Guo Xuenan guoxuenan@huawei.com Reviewed-by: Lihong Kou koulihong@huawei.com Reviewed-by: Zhang Yi yi.zhang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- fs/xfs/xfs_reflink.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index 6fa05fb78189..aa46b75d75af 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -1503,7 +1503,8 @@ xfs_reflink_unshare( if (error) goto out;
- error = filemap_write_and_wait_range(inode->i_mapping, offset, len); + error = filemap_write_and_wait_range(inode->i_mapping, offset, + offset + len - 1); if (error) goto out;
From: "Darrick J. Wong" djwong@kernel.org
mainline-inclusion from mainline-v5.12-rc4 commit 71bddbccab436a261a22afe5d90de269941d0fe7 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4KIAO CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
-------------------------------------------------
While running a new fstest that races a readonly remount with scrub running in repair mode, I observed the kernel tripping over debugging assertions in the log quiesce code that were checking that the CIL was empty. When the sysadmin runs scrub in repair mode, the scrub code allocates real transactions (with reservations) to change things, but doesn't increment the superblock writers count to block a readonly remount attempt while it is running.
We don't require the userspace caller to have a writable file descriptor to run repairs, so we have to call mnt_want_write_file to obtain freeze protection and increment the writers count. It's ok to remove the call to sb_start_write for the dry-run case because commit 8321ddb2fa29 removed the behavior where scrub and fsfreeze fight over the buffer LRU.
Signed-off-by: Darrick J. Wong djwong@kernel.org Reviewed-by: Chandan Babu R chandanrlinux@gmail.com Reviewed-by: Christoph Hellwig hch@lst.de Signed-off-by: Guo Xuenan guoxuenan@huawei.com Reviewed-by: Lihong Kou koulihong@huawei.com Reviewed-by: Zhang Yi yi.zhang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- fs/xfs/scrub/scrub.c | 32 ++++++++++++++++++++------------ fs/xfs/scrub/scrub.h | 11 +++++++++++ fs/xfs/scrub/xfs_scrub.h | 4 ++-- fs/xfs/xfs_ioctl.c | 6 +++--- 4 files changed, 36 insertions(+), 17 deletions(-)
diff --git a/fs/xfs/scrub/scrub.c b/fs/xfs/scrub/scrub.c index 8ebf35b115ce..5d85fd19e77e 100644 --- a/fs/xfs/scrub/scrub.c +++ b/fs/xfs/scrub/scrub.c @@ -22,6 +22,7 @@ #include "scrub/trace.h" #include "scrub/repair.h" #include "scrub/health.h" +#include <linux/mount.h>
/* * Online Scrub and Repair @@ -149,9 +150,10 @@ xchk_probe( STATIC int xchk_teardown( struct xfs_scrub *sc, - struct xfs_inode *ip_in, int error) { + struct xfs_inode *ip_in = XFS_I(file_inode(sc->file)); + xchk_ag_free(sc, &sc->sa); if (sc->tp) { if (error == 0 && (sc->sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR)) @@ -168,7 +170,8 @@ xchk_teardown( xfs_irele(sc->ip); sc->ip = NULL; } - sb_end_write(sc->mp->m_super); + if (sc->sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR) + mnt_drop_write_file(sc->file); if (sc->flags & XCHK_REAPING_DISABLED) xchk_start_reaping(sc); if (sc->flags & XCHK_HAS_QUOTAOFFLOCK) { @@ -456,19 +459,22 @@ static inline void xchk_postmortem(struct xfs_scrub *sc) /* Dispatch metadata scrubbing. */ int xfs_scrub_metadata( - struct xfs_inode *ip, + struct file *file, struct xfs_scrub_metadata *sm) { struct xfs_scrub sc = { - .mp = ip->i_mount, + .file = file, .sm = sm, .sa = { .agno = NULLAGNUMBER, }, }; + struct xfs_inode *ip = XFS_I(file_inode(file)); struct xfs_mount *mp = ip->i_mount; int error = 0;
+ sc.mp = mp; + BUILD_BUG_ON(sizeof(meta_scrub_ops) != (sizeof(struct xchk_meta_ops) * XFS_SCRUB_TYPE_NR));
@@ -492,12 +498,14 @@ xfs_scrub_metadata( sc.sick_mask = xchk_health_mask_for_scrub_type(sm->sm_type); retry_op: /* - * If freeze runs concurrently with a scrub, the freeze can be delayed - * indefinitely as we walk the filesystem and iterate over metadata - * buffers. Freeze quiesces the log (which waits for the buffer LRU to - * be emptied) and that won't happen while checking is running. + * When repairs are allowed, prevent freezing or readonly remount while + * scrub is running with a real transaction. */ - sb_start_write(mp->m_super); + if (sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR) { + error = mnt_want_write_file(sc.file); + if (error) + goto out; + }
/* Set up for the operation. */ error = sc.ops->setup(&sc, ip); @@ -512,7 +520,7 @@ xfs_scrub_metadata( * Tear down everything we hold, then set up again with * preparation for worst-case scenarios. */ - error = xchk_teardown(&sc, ip, 0); + error = xchk_teardown(&sc, 0); if (error) goto out; sc.flags |= XCHK_TRY_HARDER; @@ -553,7 +561,7 @@ xfs_scrub_metadata( * get all the resources it needs; either way, we go * back to the beginning and call the scrub function. */ - error = xchk_teardown(&sc, ip, 0); + error = xchk_teardown(&sc, 0); if (error) { xrep_failure(mp); goto out; @@ -565,7 +573,7 @@ xfs_scrub_metadata( out_nofix: xchk_postmortem(&sc); out_teardown: - error = xchk_teardown(&sc, ip, error); + error = xchk_teardown(&sc, error); out: trace_xchk_done(ip, sm, error); if (error == -EFSCORRUPTED || error == -EFSBADCRC) { diff --git a/fs/xfs/scrub/scrub.h b/fs/xfs/scrub/scrub.h index ad1ceb44a628..e776ab4ad322 100644 --- a/fs/xfs/scrub/scrub.h +++ b/fs/xfs/scrub/scrub.h @@ -59,7 +59,18 @@ struct xfs_scrub { struct xfs_scrub_metadata *sm; const struct xchk_meta_ops *ops; struct xfs_trans *tp; + + /* File that scrub was called with. */ + struct file *file; + + /* + * File that is undergoing the scrub operation. This can differ from + * the file that scrub was called with if we're checking file-based fs + * metadata (e.g. rt bitmaps) or if we're doing a scrub-by-handle for + * something that can't be opened directly (e.g. symlinks). + */ struct xfs_inode *ip; + void *buf; uint ilock_flags;
diff --git a/fs/xfs/scrub/xfs_scrub.h b/fs/xfs/scrub/xfs_scrub.h index 2897ba3a17e6..2ceae614ade8 100644 --- a/fs/xfs/scrub/xfs_scrub.h +++ b/fs/xfs/scrub/xfs_scrub.h @@ -7,9 +7,9 @@ #define __XFS_SCRUB_H__
#ifndef CONFIG_XFS_ONLINE_SCRUB -# define xfs_scrub_metadata(ip, sm) (-ENOTTY) +# define xfs_scrub_metadata(file, sm) (-ENOTTY) #else -int xfs_scrub_metadata(struct xfs_inode *ip, struct xfs_scrub_metadata *sm); +int xfs_scrub_metadata(struct file *file, struct xfs_scrub_metadata *sm); #endif /* CONFIG_XFS_ONLINE_SCRUB */
#endif /* __XFS_SCRUB_H__ */ diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index 3fbd98f61ea5..2a68819e4fe5 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -1848,7 +1848,7 @@ xfs_ioc_getfsmap(
STATIC int xfs_ioc_scrub_metadata( - struct xfs_inode *ip, + struct file *file, void __user *arg) { struct xfs_scrub_metadata scrub; @@ -1860,7 +1860,7 @@ xfs_ioc_scrub_metadata( if (copy_from_user(&scrub, arg, sizeof(scrub))) return -EFAULT;
- error = xfs_scrub_metadata(ip, &scrub); + error = xfs_scrub_metadata(file, &scrub); if (error) return error;
@@ -2159,7 +2159,7 @@ xfs_file_ioctl( return xfs_ioc_getfsmap(ip, arg);
case XFS_IOC_SCRUB_METADATA: - return xfs_ioc_scrub_metadata(ip, arg); + return xfs_ioc_scrub_metadata(filp, arg);
case XFS_IOC_FD_TO_HANDLE: case XFS_IOC_PATH_TO_HANDLE:
From: "Darrick J. Wong" djwong@kernel.org
mainline-inclusion from mainline-v5.13-rc2 commit 16c9de54dc868c121918f2ae91e46330f919049f category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4KIAO CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
-------------------------------------------------
sc->ip is the inode that's being scrubbed, which means that it's not set for scrub types that don't involve inodes. If one of those scrubbers (e.g. inode btrees) returns EDEADLOCK, we'll trip over the null pointer. Fix that by reporting either the file being examined or the file that was used to call scrub.
Signed-off-by: Darrick J. Wong djwong@kernel.org Reviewed-by: Brian Foster bfoster@redhat.com Signed-off-by: Guo Xuenan guoxuenan@huawei.com Reviewed-by: Lihong Kou koulihong@huawei.com Reviewed-by: Zhang Yi yi.zhang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- fs/xfs/scrub/common.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-)
diff --git a/fs/xfs/scrub/common.c b/fs/xfs/scrub/common.c index 18876056e5e0..c8daccf4a62a 100644 --- a/fs/xfs/scrub/common.c +++ b/fs/xfs/scrub/common.c @@ -74,7 +74,9 @@ __xchk_process_error( return true; case -EDEADLOCK: /* Used to restart an op with deadlock avoidance. */ - trace_xchk_deadlock_retry(sc->ip, sc->sm, *error); + trace_xchk_deadlock_retry( + sc->ip ? sc->ip : XFS_I(file_inode(sc->file)), + sc->sm, *error); break; case -EFSBADCRC: case -EFSCORRUPTED:
From: Brian Foster bfoster@redhat.com
mainline-inclusion from mainline-v5.12-rc4 commit 2675ad3890db93e58f2264d07c2d1f615ec5adf7 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4KIAO CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
-------------------------------------------------
perag reservation is enabled at mount time on a per AG basis. The upcoming change to set aside allocbt blocks from block reservation requires a populated allocbt counter as soon as possible after mount to be fully effective against large perag reservations. Therefore as a preparation step, initialize the pagf on all mounts where at least one reservation is active. Note that this already occurs to some degree on most default format filesystems as reservation requirement calculations already depend on the AGF or AGI, depending on the reservation type.
Signed-off-by: Brian Foster bfoster@redhat.com Reviewed-by: Chandan Babu R chandanrlinux@gmail.com Reviewed-by: Allison Henderson allison.henderson@oracle.com Reviewed-by: Darrick J. Wong djwong@kernel.org Signed-off-by: Darrick J. Wong djwong@kernel.org Signed-off-by: Guo Xuenan guoxuenan@huawei.com Reviewed-by: Lihong Kou koulihong@huawei.com Reviewed-by: Zhang Yi yi.zhang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- fs/xfs/libxfs/xfs_ag_resv.c | 34 +++++++++++++++++++++++----------- 1 file changed, 23 insertions(+), 11 deletions(-)
diff --git a/fs/xfs/libxfs/xfs_ag_resv.c b/fs/xfs/libxfs/xfs_ag_resv.c index fdfe6dc0d307..0edf1aea1a37 100644 --- a/fs/xfs/libxfs/xfs_ag_resv.c +++ b/fs/xfs/libxfs/xfs_ag_resv.c @@ -249,7 +249,8 @@ xfs_ag_resv_init( xfs_agnumber_t agno = pag->pag_agno; xfs_extlen_t ask; xfs_extlen_t used; - int error = 0; + int error = 0, error2; + bool has_resv = false;
/* Create the metadata reservation. */ if (pag->pag_meta_resv.ar_asked == 0) { @@ -287,6 +288,8 @@ xfs_ag_resv_init( if (error) goto out; } + if (ask) + has_resv = true; }
/* Create the RMAPBT metadata reservation */ @@ -300,19 +303,28 @@ xfs_ag_resv_init( error = __xfs_ag_resv_init(pag, XFS_AG_RESV_RMAPBT, ask, used); if (error) goto out; + if (ask) + has_resv = true; }
-#ifdef DEBUG - /* need to read in the AGF for the ASSERT below to work */ - error = xfs_alloc_pagf_init(pag->pag_mount, tp, pag->pag_agno, 0); - if (error) - return error; - - ASSERT(xfs_perag_resv(pag, XFS_AG_RESV_METADATA)->ar_reserved + - xfs_perag_resv(pag, XFS_AG_RESV_RMAPBT)->ar_reserved <= - pag->pagf_freeblks + pag->pagf_flcount); -#endif out: + /* + * Initialize the pagf if we have at least one active reservation on the + * AG. This may have occurred already via reservation calculation, but + * fall back to an explicit init to ensure the in-core allocbt usage + * counters are initialized as soon as possible. This is important + * because filesystems with large perag reservations are susceptible to + * free space reservation problems that the allocbt counter is used to + * address. + */ + if (has_resv) { + error2 = xfs_alloc_pagf_init(mp, tp, pag->pag_agno, 0); + if (error2) + return error2; + ASSERT(xfs_perag_resv(pag, XFS_AG_RESV_METADATA)->ar_reserved + + xfs_perag_resv(pag, XFS_AG_RESV_RMAPBT)->ar_reserved <= + pag->pagf_freeblks + pag->pagf_flcount); + } return error; }
From: "Darrick J. Wong" djwong@kernel.org
mainline-inclusion from mainline-v5.13-rc2 commit 0f9342513cc78a31a4a272a19b35eee4e8cd7107 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4KIAO CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
-------------------------------------------------
The new online shrink code exposed a gap in the per-AG reservation code, which is that we only return ENOSPC to callers if the entire fs doesn't have enough free blocks. Except for debugging mode, the reservation init code doesn't ever check that there's enough free space in that AG to cover the reservation.
Not having enough space is not considered an immediate fatal error that requires filesystem offlining because (a) it's shouldn't be possible to wind up in that state through normal file operations and (b) even if one did, freeing data blocks would recover the situation.
However, online shrink now needs to know if shrinking would not leave enough space so that it can abort the shrink operation. Hence we need to promote this assertion into an actual error return.
Observed by running xfs/168 with a 1k block size, though in theory this could happen with any configuration.
Signed-off-by: Darrick J. Wong djwong@kernel.org Reviewed-by: Brian Foster bfoster@redhat.com Reviewed-by: Carlos Maiolino cmaiolino@redhat.com Reviewed-by: Gao Xiang hsiangkao@linux.alibaba.com Signed-off-by: Guo Xuenan guoxuenan@huawei.com Reviewed-by: Lihong Kou koulihong@huawei.com Reviewed-by: Zhang Yi yi.zhang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- fs/xfs/libxfs/xfs_ag_resv.c | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-)
diff --git a/fs/xfs/libxfs/xfs_ag_resv.c b/fs/xfs/libxfs/xfs_ag_resv.c index 0edf1aea1a37..64b04a95fc54 100644 --- a/fs/xfs/libxfs/xfs_ag_resv.c +++ b/fs/xfs/libxfs/xfs_ag_resv.c @@ -321,10 +321,22 @@ xfs_ag_resv_init( error2 = xfs_alloc_pagf_init(mp, tp, pag->pag_agno, 0); if (error2) return error2; - ASSERT(xfs_perag_resv(pag, XFS_AG_RESV_METADATA)->ar_reserved + - xfs_perag_resv(pag, XFS_AG_RESV_RMAPBT)->ar_reserved <= - pag->pagf_freeblks + pag->pagf_flcount); + + /* + * If there isn't enough space in the AG to satisfy the + * reservation, let the caller know that there wasn't enough + * space. Callers are responsible for deciding what to do + * next, since (in theory) we can stumble along with + * insufficient reservation if data blocks are being freed to + * replenish the AG's free space. + */ + if (!error && + xfs_perag_resv(pag, XFS_AG_RESV_METADATA)->ar_reserved + + xfs_perag_resv(pag, XFS_AG_RESV_RMAPBT)->ar_reserved > + pag->pagf_freeblks + pag->pagf_flcount) + error = -ENOSPC; } + return error; }
From: Brian Foster bfoster@redhat.com
mainline-inclusion from mainline-v5.11-rc4 commit b0eb9e1182668b0e9cf81dbf38041cfb8c12887f category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4KIAO CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
-------------------------------------------------
Now that log covering occurs on quiesce, we'd like to reuse the underlying superblock sync for final superblock updates. This includes things like lazy superblock counter updates, log feature incompat bits in the future, etc. One quirk to this approach is that once the log is in the IDLE (i.e. already covered) state, any subsequent log write resets the state back to NEED. This means that a final superblock sync to an already covered log requires two more sb syncs to return the log back to IDLE again.
For example, if a lazy superblock enabled filesystem is mount cycled without any modifications, the unmount path syncs the superblock once and writes an unmount record. With the desired log quiesce covering behavior, we sync the superblock three times at unmount time: once for the lazy superblock counter update and twice more to cover the log. By contrast, if the log is active or only partially covered at unmount time, a final superblock sync would doubly serve as the one or two remaining syncs required to cover the log.
This duplicate covering sequence is unnecessary because the filesystem remains consistent if a crash occurs at any point. The superblock will either be recovered in the event of a crash or written back before the log is quiesced and potentially cleaned with an unmount record.
Update the log covering state machine to remain in the IDLE state if additional covering checkpoints pass through the log. This facilitates final superblock updates (such as lazy superblock counters) via a single sb sync without losing covered status. This provides some consistency with the active and partially covered cases and also avoids harmless, but spurious checkpoints when quiescing the log.
Signed-off-by: Brian Foster bfoster@redhat.com Reviewed-by: Darrick J. Wong darrick.wong@oracle.com Reviewed-by: Darrick J. Wong djwong@kernel.org Signed-off-by: Darrick J. Wong djwong@kernel.org Reviewed-by: Allison Henderson allison.henderson@oracle.com Signed-off-by: Guo Xuenan guoxuenan@huawei.com Reviewed-by: Lihong Kou koulihong@huawei.com Reviewed-by: Zhang Yi yi.zhang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- fs/xfs/xfs_log.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-)
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index fa2d05e65ff1..b70b7d041d79 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -2538,12 +2538,15 @@ xlog_covered_state( int iclogs_changed) { /* - * We usually go to NEED. But we go to NEED2 if the changed indicates we - * are done writing the dummy record. If we are done with the second - * dummy recored (DONE2), then we go to IDLE. + * We go to NEED for any non-covering writes. We go to NEED2 if we just + * wrote the first covering record (DONE). We go to IDLE if we just + * wrote the second covering record (DONE2) and remain in IDLE until a + * non-covering write occurs. */ switch (prev_state) { case XLOG_STATE_COVER_IDLE: + if (iclogs_changed == 1) + return XLOG_STATE_COVER_IDLE; case XLOG_STATE_COVER_NEED: case XLOG_STATE_COVER_NEED2: break;
From: "Gustavo A. R. Silva" gustavoars@kernel.org
mainline-inclusion from mainline-v5.13-rc2 commit 53004ee78d6273c994534ccf79d993098ac89769 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4KIAO CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
-------------------------------------------------
In preparation to enable -Wimplicit-fallthrough for Clang, fix the following warnings by replacing /* fall through */ comments, and its variants, with the new pseudo-keyword macro fallthrough:
fs/xfs/libxfs/xfs_alloc.c:3167:2: warning: unannotated fall-through between switch labels [-Wimplicit-fallthrough] fs/xfs/libxfs/xfs_da_btree.c:286:3: warning: unannotated fall-through between switch labels [-Wimplicit-fallthrough] fs/xfs/libxfs/xfs_ag_resv.c:346:2: warning: unannotated fall-through between switch labels [-Wimplicit-fallthrough] fs/xfs/libxfs/xfs_ag_resv.c:388:2: warning: unannotated fall-through between switch labels [-Wimplicit-fallthrough] fs/xfs/xfs_bmap_util.c:246:2: warning: unannotated fall-through between switch labels [-Wimplicit-fallthrough] fs/xfs/xfs_export.c:88:2: warning: unannotated fall-through between switch labels [-Wimplicit-fallthrough] fs/xfs/xfs_export.c:96:2: warning: unannotated fall-through between switch labels [-Wimplicit-fallthrough] fs/xfs/xfs_file.c:867:3: warning: unannotated fall-through between switch labels [-Wimplicit-fallthrough] fs/xfs/xfs_ioctl.c:562:2: warning: unannotated fall-through between switch labels [-Wimplicit-fallthrough] fs/xfs/xfs_ioctl.c:1548:2: warning: unannotated fall-through between switch labels [-Wimplicit-fallthrough] fs/xfs/xfs_iomap.c:1040:2: warning: unannotated fall-through between switch labels [-Wimplicit-fallthrough] fs/xfs/xfs_inode.c:852:2: warning: unannotated fall-through between switch labels [-Wimplicit-fallthrough] fs/xfs/xfs_log.c:2627:2: warning: unannotated fall-through between switch labels [-Wimplicit-fallthrough] fs/xfs/xfs_trans_buf.c:298:2: warning: unannotated fall-through between switch labels [-Wimplicit-fallthrough] fs/xfs/scrub/bmap.c:275:2: warning: unannotated fall-through between switch labels [-Wimplicit-fallthrough] fs/xfs/scrub/btree.c:48:2: warning: unannotated fall-through between switch labels [-Wimplicit-fallthrough] fs/xfs/scrub/common.c:85:2: warning: unannotated fall-through between switch labels [-Wimplicit-fallthrough] fs/xfs/scrub/common.c:138:2: warning: unannotated fall-through between switch labels [-Wimplicit-fallthrough] fs/xfs/scrub/common.c:698:2: warning: unannotated fall-through between switch labels [-Wimplicit-fallthrough] fs/xfs/scrub/dabtree.c:51:2: warning: unannotated fall-through between switch labels [-Wimplicit-fallthrough] fs/xfs/scrub/repair.c:951:2: warning: unannotated fall-through between switch labels [-Wimplicit-fallthrough] fs/xfs/scrub/agheader.c:89:2: warning: unannotated fall-through between switch labels [-Wimplicit-fallthrough]
Notice that Clang doesn't recognize /* fall through */ comments as implicit fall-through markings, so in order to globally enable -Wimplicit-fallthrough for Clang, these comments need to be replaced with fallthrough; in the whole codebase.
Link: https://github.com/KSPP/linux/issues/115 Signed-off-by: Gustavo A. R. Silva gustavoars@kernel.org Signed-off-by: Guo Xuenan guoxuenan@huawei.com Reviewed-by: Lihong Kou koulihong@huawei.com Reviewed-by: Zhang Yi yi.zhang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- fs/xfs/libxfs/xfs_ag_resv.c | 4 ++-- fs/xfs/libxfs/xfs_alloc.c | 2 +- fs/xfs/libxfs/xfs_da_btree.c | 2 +- fs/xfs/scrub/agheader.c | 1 + fs/xfs/scrub/bmap.c | 2 +- fs/xfs/scrub/btree.c | 2 +- fs/xfs/scrub/common.c | 6 +++--- fs/xfs/scrub/dabtree.c | 2 +- fs/xfs/scrub/repair.c | 2 +- fs/xfs/xfs_bmap_util.c | 2 +- fs/xfs/xfs_export.c | 4 ++-- fs/xfs/xfs_file.c | 2 +- fs/xfs/xfs_inode.c | 2 +- fs/xfs/xfs_ioctl.c | 4 ++-- fs/xfs/xfs_iomap.c | 2 +- fs/xfs/xfs_log.c | 1 + fs/xfs/xfs_trans_buf.c | 2 +- 17 files changed, 22 insertions(+), 20 deletions(-)
diff --git a/fs/xfs/libxfs/xfs_ag_resv.c b/fs/xfs/libxfs/xfs_ag_resv.c index 64b04a95fc54..f38034194bd7 100644 --- a/fs/xfs/libxfs/xfs_ag_resv.c +++ b/fs/xfs/libxfs/xfs_ag_resv.c @@ -362,7 +362,7 @@ xfs_ag_resv_alloc_extent( break; default: ASSERT(0); - /* fall through */ + fallthrough; case XFS_AG_RESV_NONE: field = args->wasdel ? XFS_TRANS_SB_RES_FDBLOCKS : XFS_TRANS_SB_FDBLOCKS; @@ -404,7 +404,7 @@ xfs_ag_resv_free_extent( break; default: ASSERT(0); - /* fall through */ + fallthrough; case XFS_AG_RESV_NONE: xfs_trans_mod_sb(tp, XFS_TRANS_SB_FDBLOCKS, (int64_t)len); return; diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c index 6989efd55b5b..6f54f639913f 100644 --- a/fs/xfs/libxfs/xfs_alloc.c +++ b/fs/xfs/libxfs/xfs_alloc.c @@ -3110,7 +3110,7 @@ xfs_alloc_vextent( } args->agbno = XFS_FSB_TO_AGBNO(mp, args->fsbno); args->type = XFS_ALLOCTYPE_NEAR_BNO; - /* FALLTHROUGH */ + fallthrough; case XFS_ALLOCTYPE_FIRST_AG: /* * Rotate through the allocation groups looking for a winner. diff --git a/fs/xfs/libxfs/xfs_da_btree.c b/fs/xfs/libxfs/xfs_da_btree.c index e46bc03365db..5a13bfe0ebbf 100644 --- a/fs/xfs/libxfs/xfs_da_btree.c +++ b/fs/xfs/libxfs/xfs_da_btree.c @@ -282,7 +282,7 @@ xfs_da3_node_read_verify( __this_address); break; } - /* fall through */ + fallthrough; case XFS_DA_NODE_MAGIC: fa = xfs_da3_node_verify(bp); if (fa) diff --git a/fs/xfs/scrub/agheader.c b/fs/xfs/scrub/agheader.c index de42d667108a..16895b2f0bf9 100644 --- a/fs/xfs/scrub/agheader.c +++ b/fs/xfs/scrub/agheader.c @@ -86,6 +86,7 @@ xchk_superblock( case -ENOSYS: case -EFBIG: error = -EFSCORRUPTED; + fallthrough; default: break; } diff --git a/fs/xfs/scrub/bmap.c b/fs/xfs/scrub/bmap.c index fed56d213a3f..e68d5d6a6e6b 100644 --- a/fs/xfs/scrub/bmap.c +++ b/fs/xfs/scrub/bmap.c @@ -272,7 +272,7 @@ xchk_bmap_iextent_xref( case XFS_DATA_FORK: if (xfs_is_reflink_inode(info->sc->ip)) break; - /* fall through */ + fallthrough; case XFS_ATTR_FORK: xchk_xref_is_not_shared(info->sc, agbno, irec->br_blockcount); diff --git a/fs/xfs/scrub/btree.c b/fs/xfs/scrub/btree.c index debf392e0515..8c81761e52ab 100644 --- a/fs/xfs/scrub/btree.c +++ b/fs/xfs/scrub/btree.c @@ -43,7 +43,7 @@ __xchk_btree_process_error( /* Note the badness but don't abort. */ sc->sm->sm_flags |= errflag; *error = 0; - /* fall through */ + fallthrough; default: if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) trace_xchk_ifork_btree_op_error(sc, cur, level, diff --git a/fs/xfs/scrub/common.c b/fs/xfs/scrub/common.c index c8daccf4a62a..54b5e9a61455 100644 --- a/fs/xfs/scrub/common.c +++ b/fs/xfs/scrub/common.c @@ -83,7 +83,7 @@ __xchk_process_error( /* Note the badness but don't abort. */ sc->sm->sm_flags |= errflag; *error = 0; - /* fall through */ + fallthrough; default: trace_xchk_op_error(sc, agno, bno, *error, ret_ip); @@ -136,7 +136,7 @@ __xchk_fblock_process_error( /* Note the badness but don't abort. */ sc->sm->sm_flags |= errflag; *error = 0; - /* fall through */ + fallthrough; default: trace_xchk_file_op_error(sc, whichfork, offset, *error, ret_ip); @@ -715,7 +715,7 @@ xchk_get_inode( if (error) return -ENOENT; error = -EFSCORRUPTED; - /* fall through */ + fallthrough; default: trace_xchk_op_error(sc, XFS_INO_TO_AGNO(mp, sc->sm->sm_ino), diff --git a/fs/xfs/scrub/dabtree.c b/fs/xfs/scrub/dabtree.c index 653f3280e1c1..9f0dbb47c82c 100644 --- a/fs/xfs/scrub/dabtree.c +++ b/fs/xfs/scrub/dabtree.c @@ -47,7 +47,7 @@ xchk_da_process_error( /* Note the badness but don't abort. */ sc->sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT; *error = 0; - /* fall through */ + fallthrough; default: trace_xchk_file_op_error(sc, ds->dargs.whichfork, xfs_dir2_da_to_db(ds->dargs.geo, diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c index 25e86c71e7b9..c814b18efba9 100644 --- a/fs/xfs/scrub/repair.c +++ b/fs/xfs/scrub/repair.c @@ -944,7 +944,7 @@ xrep_ino_dqattach( xrep_force_quotacheck(sc, XFS_DQTYPE_GROUP); if (XFS_IS_PQUOTA_ON(sc->mp) && !sc->ip->i_pdquot) xrep_force_quotacheck(sc, XFS_DQTYPE_PROJ); - /* fall through */ + fallthrough; case -ESRCH: error = 0; break; diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index 7371a7f7c652..32180dd5ee7d 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -244,7 +244,7 @@ xfs_bmap_count_blocks( */ *count += btblocks - 1;
- /* fall through */ + fallthrough; case XFS_DINODE_FMT_EXTENTS: *nextents = xfs_bmap_count_leaves(ifp, count); break; diff --git a/fs/xfs/xfs_export.c b/fs/xfs/xfs_export.c index 465fd9e048d4..1da59bdff245 100644 --- a/fs/xfs/xfs_export.c +++ b/fs/xfs/xfs_export.c @@ -84,7 +84,7 @@ xfs_fs_encode_fh( case FILEID_INO32_GEN_PARENT: fid->i32.parent_ino = XFS_I(parent)->i_ino; fid->i32.parent_gen = parent->i_generation; - /*FALLTHRU*/ + fallthrough; case FILEID_INO32_GEN: fid->i32.ino = XFS_I(inode)->i_ino; fid->i32.gen = inode->i_generation; @@ -92,7 +92,7 @@ xfs_fs_encode_fh( case FILEID_INO32_GEN_PARENT | XFS_FILEID_TYPE_64FLAG: fid64->parent_ino = XFS_I(parent)->i_ino; fid64->parent_gen = parent->i_generation; - /*FALLTHRU*/ + fallthrough; case FILEID_INO32_GEN | XFS_FILEID_TYPE_64FLAG: fid64->ino = XFS_I(inode)->i_ino; fid64->gen = inode->i_generation; diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 5b0f93f73837..aeb09b059a23 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -805,7 +805,7 @@ xfs_break_layouts( error = xfs_break_dax_layouts(inode, &retry); if (error || retry) break; - /* fall through */ + fallthrough; case BREAK_WRITE: error = xfs_break_leased_layouts(inode, iolock, &retry); break; diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 2bfbcf28b1bd..b4557a827666 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -907,7 +907,7 @@ xfs_ialloc( xfs_inode_inherit_flags(ip, pip); if (pip && (pip->i_d.di_flags2 & XFS_DIFLAG2_ANY)) xfs_inode_inherit_flags2(ip, pip); - /* FALLTHROUGH */ + fallthrough; case S_IFLNK: ip->i_df.if_format = XFS_DINODE_FMT_EXTENTS; ip->i_df.if_flags = XFS_IFEXTENTS; diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index 2a68819e4fe5..f6bb581edc6c 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -557,7 +557,7 @@ xfs_ioc_attrmulti_one( case ATTR_OP_REMOVE: value = NULL; *len = 0; - /* fall through */ + fallthrough; case ATTR_OP_SET: error = mnt_want_write_file(parfilp); if (error) @@ -1669,7 +1669,7 @@ xfs_ioc_getbmap( switch (cmd) { case XFS_IOC_GETBMAPA: bmx.bmv_iflags = BMV_IF_ATTRFORK; - /*FALLTHRU*/ + fallthrough; case XFS_IOC_GETBMAP: if (file->f_mode & FMODE_NOCMTIME) bmx.bmv_iflags |= BMV_IF_NO_DMAPI_READ; diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 7b9ff824e82d..9d701ce2f703 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -1031,7 +1031,7 @@ xfs_buffered_write_iomap_begin( prealloc_blocks = 0; goto retry; } - /*FALLTHRU*/ + fallthrough; default: goto out_unlock; } diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index b70b7d041d79..82002eee667d 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -2547,6 +2547,7 @@ xlog_covered_state( case XLOG_STATE_COVER_IDLE: if (iclogs_changed == 1) return XLOG_STATE_COVER_IDLE; + fallthrough; case XLOG_STATE_COVER_NEED: case XLOG_STATE_COVER_NEED2: break; diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c index 42d63b830cb9..342817e15ada 100644 --- a/fs/xfs/xfs_trans_buf.c +++ b/fs/xfs/xfs_trans_buf.c @@ -294,7 +294,7 @@ xfs_trans_read_buf_map( default: if (tp && (tp->t_flags & XFS_TRANS_DIRTY)) xfs_force_shutdown(tp->t_mountp, SHUTDOWN_META_IO_ERROR); - /* fall through */ + fallthrough; case -ENOMEM: case -EAGAIN: return error;
From: Dave Chinner dchinner@redhat.com
mainline-inclusion from mainline-v5.13-rc2 commit 991c2c5980fb97ae6194f7c46b44f9446629eb4e category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4KIAO CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
-------------------------------------------------
xfs/538 is assert failing with this trace when testing with directory block sizes of 64kB:
XFS: Assertion failed: !xfs_need_iread_extents(ifp), file: fs/xfs/libxfs/xfs_bmap.c, line: 608 .... Call Trace: xfs_bmap_btree_to_extents+0x2a9/0x470 ? kmem_cache_alloc+0xe7/0x220 __xfs_bunmapi+0x4ca/0xdf0 xfs_bunmapi+0x1a/0x30 xfs_dir2_shrink_inode+0x71/0x210 xfs_dir2_block_to_sf+0x2ae/0x410 xfs_dir2_block_removename+0x21a/0x280 xfs_dir_removename+0x195/0x1d0 xfs_remove+0x244/0x460 xfs_vn_unlink+0x53/0xa0 ? selinux_inode_unlink+0x13/0x20 vfs_unlink+0x117/0x220 do_unlinkat+0x1a2/0x2d0 __x64_sys_unlink+0x42/0x60 do_syscall_64+0x3a/0x70 entry_SYSCALL_64_after_hwframe+0x44/0xae
This is a check to ensure that the extents have been read into memory before we are doing a ifork btree manipulation. This assert is bogus in the above case.
We have a fragmented directory block that has more extents in it than can fit in extent format, so the inode data fork is in btree format. xfs_dir2_shrink_inode() asks to remove all remaining 16 filesystem blocks from the inode so it can convert to short form, and __xfs_bunmapi() removes all the extents. We now have a data fork in btree format but have zero extents in the fork. This incorrectly trips the xfs_need_iread_extents() assert because it assumes that an empty extent btree means the extent tree has not been read into memory yet. This is clearly not the case with xfs_bunmapi(), as it has an explicit call to xfs_iread_extents() in it to pull the extents into memory before it starts unmapping.
Also, the assert directly after this bogus one is:
ASSERT(ifp->if_format == XFS_DINODE_FMT_BTREE);
Which covers the context in which it is legal to call xfs_bmap_btree_to_extents just fine. Hence we should just remove the bogus assert as it is clearly wrong and causes a regression.
The returns the test behaviour to the pre-existing assert failure in xfs_dir2_shrink_inode() that indicates xfs_bunmapi() has failed to remove all the extents in the range it was asked to unmap.
Signed-off-by: Dave Chinner dchinner@redhat.com Reviewed-by: Darrick J. Wong djwong@kernel.org Signed-off-by: Darrick J. Wong djwong@kernel.org Signed-off-by: Guo Xuenan guoxuenan@huawei.com Reviewed-by: Lihong Kou koulihong@huawei.com Reviewed-by: Zhang Yi yi.zhang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- fs/xfs/libxfs/xfs_bmap.c | 1 - 1 file changed, 1 deletion(-)
diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index d9a692484eae..42e0a7ee7d2b 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -603,7 +603,6 @@ xfs_bmap_btree_to_extents(
ASSERT(cur); ASSERT(whichfork != XFS_COW_FORK); - ASSERT(ifp->if_flags & XFS_IFEXTENTS); ASSERT(ifp->if_format == XFS_DINODE_FMT_BTREE); ASSERT(be16_to_cpu(rblock->bb_level) == 1); ASSERT(be16_to_cpu(rblock->bb_numrecs) == 1);
From: Dave Chinner dchinner@redhat.com
mainline-inclusion from mainline-v5.13-rc2 commit 0fe0bbe00a6fb77adf75085b7d06b71a830dd6f2 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4KIAO CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
-------------------------------------------------
large directory block size operations are assert failing because xfs_bunmapi() is not completely removing fragmented directory blocks like so:
XFS: Assertion failed: done, file: fs/xfs/libxfs/xfs_dir2.c, line: 677 .... Call Trace: xfs_dir2_shrink_inode+0x1a8/0x210 xfs_dir2_block_to_sf+0x2ae/0x410 xfs_dir2_block_removename+0x21a/0x280 xfs_dir_removename+0x195/0x1d0 xfs_rename+0xb79/0xc50 ? avc_has_perm+0x8d/0x1a0 ? avc_has_perm_noaudit+0x9a/0x120 xfs_vn_rename+0xdb/0x150 vfs_rename+0x719/0xb50 ? __lookup_hash+0x6a/0xa0 do_renameat2+0x413/0x5e0 __x64_sys_rename+0x45/0x50 do_syscall_64+0x3a/0x70 entry_SYSCALL_64_after_hwframe+0x44/0xae
We are aborting the bunmapi() pass because of this specific chunk of code:
/* * Make sure we don't touch multiple AGF headers out of order * in a single transaction, as that could cause AB-BA deadlocks. */ if (!wasdel && !isrt) { agno = XFS_FSB_TO_AGNO(mp, del.br_startblock); if (prev_agno != NULLAGNUMBER && prev_agno > agno) break; prev_agno = agno; }
This is designed to prevent deadlocks in AGF locking when freeing multiple extents by ensuring that we only ever lock in increasing AG number order. Unfortunately, this also violates the "bunmapi will always succeed" semantic that some high level callers depend on, such as xfs_dir2_shrink_inode(), xfs_da_shrink_inode() and xfs_inactive_symlink_rmt().
This AG lock ordering was introduced back in 2017 to fix deadlocks triggered by generic/299 as reported here:
https://lore.kernel.org/linux-xfs/800468eb-3ded-9166-20a4-047de8018582@gmail...
This codebase is old enough that it was before we were defering all AG based extent freeing from within xfs_bunmapi(). THat is, we never actually lock AGs in xfs_bunmapi() any more - every non-rt based extent free is added to the defer ops list, as is all BMBT block freeing. And RT extents are not RT based, so there's no lock ordering issues associated with them.
Hence this AGF lock ordering code is both broken and dead. Let's just remove it so that the large directory block code works reliably again.
Tested against xfs/538 and generic/299 which is the original test that exposed the deadlocks that this code fixed.
Fixes: 5b094d6dac04 ("xfs: fix multi-AG deadlock in xfs_bunmapi") Signed-off-by: Dave Chinner dchinner@redhat.com Reviewed-by: Darrick J. Wong djwong@kernel.org Signed-off-by: Darrick J. Wong djwong@kernel.org Signed-off-by: Guo Xuenan guoxuenan@huawei.com Reviewed-by: Lihong Kou koulihong@huawei.com Reviewed-by: Zhang Yi yi.zhang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- fs/xfs/libxfs/xfs_bmap.c | 11 ----------- 1 file changed, 11 deletions(-)
diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 42e0a7ee7d2b..069b68d09287 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -5276,7 +5276,6 @@ __xfs_bunmapi( xfs_fsblock_t sum; xfs_filblks_t len = *rlen; /* length to unmap in file */ xfs_fileoff_t max_len; - xfs_agnumber_t prev_agno = NULLAGNUMBER, agno; xfs_fileoff_t end; struct xfs_iext_cursor icur; bool done = false; @@ -5367,16 +5366,6 @@ __xfs_bunmapi( del = got; wasdel = isnullstartblock(del.br_startblock);
- /* - * Make sure we don't touch multiple AGF headers out of order - * in a single transaction, as that could cause AB-BA deadlocks. - */ - if (!wasdel && !isrt) { - agno = XFS_FSB_TO_AGNO(mp, del.br_startblock); - if (prev_agno != NULLAGNUMBER && prev_agno > agno) - break; - prev_agno = agno; - } if (got.br_startoff < start) { del.br_startoff = start; del.br_blockcount -= start - got.br_startoff;
From: Dave Chinner dchinner@redhat.com
mainline-inclusion from mainline-v5.13-rc4 commit 977ec4ddf0b75b30afa443cf71ae80e20f501b15 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4KIAO CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
-------------------------------------------------
Because this happens at high thread counts on high IOPS devices doing mixed read/write AIO-DIO to a single file at about a million iops:
64.09% 0.21% [kernel] [k] io_submit_one - 63.87% io_submit_one - 44.33% aio_write - 42.70% xfs_file_write_iter - 41.32% xfs_file_dio_write_aligned - 25.51% xfs_file_write_checks - 21.60% _raw_spin_lock - 21.59% do_raw_spin_lock - 19.70% __pv_queued_spin_lock_slowpath
This also happens of the IO completion IO path:
22.89% 0.69% [kernel] [k] xfs_dio_write_end_io - 22.49% xfs_dio_write_end_io - 21.79% _raw_spin_lock - 20.97% do_raw_spin_lock - 20.10% __pv_queued_spin_lock_slowpath
IOWs, fio is burning ~14 whole CPUs on this spin lock.
So, do an unlocked check against inode size first, then if we are at/beyond EOF, take the spinlock and recheck. This makes the spinlock disappear from the overwrite fastpath.
I'd like to report that fixing this makes things go faster. It doesn't - it just exposes the the XFS_ILOCK as the next severe contention point doing extent mapping lookups, and that now burns all the 14 CPUs this spinlock was burning.
Signed-off-by: Dave Chinner dchinner@redhat.com Reviewed-by: Carlos Maiolino cmaiolino@redhat.com Reviewed-by: Darrick J. Wong djwong@kernel.org Signed-off-by: Darrick J. Wong djwong@kernel.org Signed-off-by: Guo Xuenan guoxuenan@huawei.com Reviewed-by: Lihong Kou koulihong@huawei.com Reviewed-by: Zhang Yi yi.zhang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- fs/xfs/xfs_file.c | 48 ++++++++++++++++++++++++++++++----------------- 1 file changed, 31 insertions(+), 17 deletions(-)
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index aeb09b059a23..5fedb04b0cc3 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -342,21 +342,30 @@ xfs_file_aio_write_checks( xfs_ilock(ip, *iolock); goto restart; } + /* * If the offset is beyond the size of the file, we need to zero any * blocks that fall between the existing EOF and the start of this - * write. If zeroing is needed and we are currently holding the - * iolock shared, we need to update it to exclusive which implies - * having to redo all checks before. + * write. If zeroing is needed and we are currently holding the iolock + * shared, we need to update it to exclusive which implies having to + * redo all checks before. + * + * We need to serialise against EOF updates that occur in IO completions + * here. We want to make sure that nobody is changing the size while we + * do this check until we have placed an IO barrier (i.e. hold the + * XFS_IOLOCK_EXCL) that prevents new IO from being dispatched. The + * spinlock effectively forms a memory barrier once we have the + * XFS_IOLOCK_EXCL so we are guaranteed to see the latest EOF value and + * hence be able to correctly determine if we need to run zeroing. * - * We need to serialise against EOF updates that occur in IO - * completions here. We want to make sure that nobody is changing the - * size while we do this check until we have placed an IO barrier (i.e. - * hold the XFS_IOLOCK_EXCL) that prevents new IO from being dispatched. - * The spinlock effectively forms a memory barrier once we have the - * XFS_IOLOCK_EXCL so we are guaranteed to see the latest EOF value - * and hence be able to correctly determine if we need to run zeroing. + * We can do an unlocked check here safely as IO completion can only + * extend EOF. Truncate is locked out at this point, so the EOF can + * not move backwards, only forwards. Hence we only need to take the + * slow path and spin locks when we are at or beyond the current EOF. */ + if (iocb->ki_pos <= i_size_read(inode)) + goto out; + spin_lock(&ip->i_flags_lock); isize = i_size_read(inode); if (iocb->ki_pos > isize) { @@ -380,7 +389,7 @@ xfs_file_aio_write_checks( drained_dio = true; goto restart; } - + trace_xfs_zero_eof(ip, isize, iocb->ki_pos - isize); error = iomap_zero_range(inode, isize, iocb->ki_pos - isize, NULL, &xfs_buffered_write_iomap_ops); @@ -389,12 +398,7 @@ xfs_file_aio_write_checks( } else spin_unlock(&ip->i_flags_lock);
- /* - * Updating the timestamps will grab the ilock again from - * xfs_fs_dirty_inode, so we have to call it after dropping the - * lock above. Eventually we should look into a way to avoid - * the pointless lock roundtrip. - */ +out: return file_modified(file); }
@@ -460,7 +464,17 @@ xfs_dio_write_end_io( * other IO completions here to update the EOF. Failing to serialise * here can result in EOF moving backwards and Bad Things Happen when * that occurs. + * + * As IO completion only ever extends EOF, we can do an unlocked check + * here to avoid taking the spinlock. If we land within the current EOF, + * then we do not need to do an extending update at all, and we don't + * need to take the lock to check this. If we race with an update moving + * EOF, then we'll either still be beyond EOF and need to take the lock, + * or we'll be within EOF and we don't need to take it at all. */ + if (offset + size <= i_size_read(inode)) + goto out; + spin_lock(&ip->i_flags_lock); if (offset + size > i_size_read(inode)) { i_size_write(inode, offset + size);
From: "Darrick J. Wong" djwong@kernel.org
mainline-inclusion from mainline-v5.13-rc4 commit 255794c7ed7adb914e831f5e4905d783d31378d2 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4KIAO CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
-------------------------------------------------
While running some fuzz tests on inode metadata, I noticed that the filesystem health report (as provided by xfs_spaceman) failed to report the file corruption even when spaceman was run immediately after running xfs_scrub to detect the corruption. That isn't the intended behavior; one ought to be able to run scrub to detect errors in the ondisk metadata and be able to access to those reports for some time after the scrub.
After running the same sequence through an instrumented kernel, I discovered the reason why -- scrub igets the file, scans it, marks it sick, and ireleases the inode. When the VFS lets go of the incore inode, it moves to RECLAIMABLE state. If spaceman igets the incore inode before it moves to RECLAIM state, iget reinitializes the VFS state, clears the sick and checked masks, and hands back the inode. At this point, the caller has the exact same incore inode, but with all the health state erased.
In other words, we're erasing the incore inode's health state flags when we've decided NOT to sever the link between the incore inode and the ondisk inode. This is wrong, so we need to remove the lines that zero the fields from xfs_iget_cache_hit.
As a precaution, we add the same lines into xfs_reclaim_inode just after we sever the link between incore and ondisk inode. Strictly speaking this isn't necessary because once an inode has gone through reclaim it must go through xfs_inode_alloc (which also zeroes the state) and xfs_iget is careful to check for mismatches between the inode it pulls out of the radix tree and the one it wants.
Fixes: 6772c1f11206 ("xfs: track metadata health status") Signed-off-by: Darrick J. Wong djwong@kernel.org Reviewed-by: Brian Foster bfoster@redhat.com Reviewed-by: Dave Chinner dchinner@redhat.com Reviewed-by: Carlos Maiolino cmaiolino@redhat.com Signed-off-by: Guo Xuenan guoxuenan@huawei.com Reviewed-by: Lihong Kou koulihong@huawei.com Reviewed-by: Zhang Yi yi.zhang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- fs/xfs/xfs_icache.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-)
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index deb99300d171..ed1be02dc1c3 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -433,9 +433,6 @@ xfs_iget_cache_hit( ip->i_flags |= XFS_INEW; xfs_inode_clear_reclaim_tag(pag, ip->i_ino); inode->i_state = I_NEW; - ip->i_sick = 0; - ip->i_checked = 0; - spin_unlock(&ip->i_flags_lock); spin_unlock(&pag->pag_ici_lock); } else { @@ -1063,6 +1060,8 @@ xfs_reclaim_inode( spin_lock(&ip->i_flags_lock); ip->i_flags = XFS_IRECLAIM; ip->i_ino = 0; + ip->i_sick = 0; + ip->i_checked = 0; spin_unlock(&ip->i_flags_lock);
xfs_iunlock(ip, XFS_ILOCK_EXCL);
From: Dave Chinner dchinner@redhat.com
mainline-inclusion from mainline-v5.13-rc4 commit a6a65fef5ef8d0a6a0ce514eb66b2f3dfa777b48 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4KIAO CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
-------------------------------------------------
We don't need to look at the xfs_mount and superblock every time we need to do an iclog roundoff calculation. The property is fixed for the life of the log, so store the roundoff in the log at mount time and use that everywhere.
On a debug build:
$ size fs/xfs/xfs_log.o.* text data bss dec hex filename 27360 560 8 27928 6d18 fs/xfs/xfs_log.o.orig 27219 560 8 27787 6c8b fs/xfs/xfs_log.o.patched
Signed-off-by: Dave Chinner dchinner@redhat.com Reviewed-by: Chandan Babu R chandanrlinux@gmail.com Reviewed-by: Darrick J. Wong djwong@kernel.org Reviewed-by: Christoph Hellwig hch@lst.de Reviewed-by: Allison Henderson allison.henderson@oracle.com Signed-off-by: Guo Xuenan guoxuenan@huawei.com Reviewed-by: Lihong Kou koulihong@huawei.com Reviewed-by: Zhang Yi yi.zhang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- fs/xfs/libxfs/xfs_log_format.h | 3 -- fs/xfs/xfs_log.c | 59 ++++++++++++++-------------------- fs/xfs/xfs_log_priv.h | 2 ++ 3 files changed, 27 insertions(+), 37 deletions(-)
diff --git a/fs/xfs/libxfs/xfs_log_format.h b/fs/xfs/libxfs/xfs_log_format.h index 8bd00da6d2a4..16587219549c 100644 --- a/fs/xfs/libxfs/xfs_log_format.h +++ b/fs/xfs/libxfs/xfs_log_format.h @@ -34,9 +34,6 @@ typedef uint32_t xlog_tid_t; #define XLOG_MIN_RECORD_BSHIFT 14 /* 16384 == 1 << 14 */ #define XLOG_BIG_RECORD_BSHIFT 15 /* 32k == 1 << 15 */ #define XLOG_MAX_RECORD_BSHIFT 18 /* 256k == 1 << 18 */ -#define XLOG_BTOLSUNIT(log, b) (((b)+(log)->l_mp->m_sb.sb_logsunit-1) / \ - (log)->l_mp->m_sb.sb_logsunit) -#define XLOG_LSUNITTOB(log, su) ((su) * (log)->l_mp->m_sb.sb_logsunit)
#define XLOG_HEADER_SIZE 512
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 82002eee667d..871922f4d3fe 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -1323,6 +1323,11 @@ xlog_alloc_log( xlog_assign_atomic_lsn(&log->l_last_sync_lsn, 1, 0); log->l_curr_cycle = 1; /* 0 is bad since this is initial value */
+ if (xfs_sb_version_haslogv2(&mp->m_sb) && mp->m_sb.sb_logsunit > 1) + log->l_iclog_roundoff = mp->m_sb.sb_logsunit; + else + log->l_iclog_roundoff = BBSIZE; + xlog_grant_head_init(&log->l_reserve_head); xlog_grant_head_init(&log->l_write_head);
@@ -1775,29 +1780,15 @@ xlog_calc_iclog_size( uint32_t *roundoff) { uint32_t count_init, count; - bool use_lsunit; - - use_lsunit = xfs_sb_version_haslogv2(&log->l_mp->m_sb) && - log->l_mp->m_sb.sb_logsunit > 1;
/* Add for LR header */ count_init = log->l_iclog_hsize + iclog->ic_offset; + count = roundup(count_init, log->l_iclog_roundoff);
- /* Round out the log write size */ - if (use_lsunit) { - /* we have a v2 stripe unit to use */ - count = XLOG_LSUNITTOB(log, XLOG_BTOLSUNIT(log, count_init)); - } else { - count = BBTOB(BTOBB(count_init)); - } - - ASSERT(count >= count_init); *roundoff = count - count_init;
- if (use_lsunit) - ASSERT(*roundoff < log->l_mp->m_sb.sb_logsunit); - else - ASSERT(*roundoff < BBTOB(1)); + ASSERT(count >= count_init); + ASSERT(*roundoff < log->l_iclog_roundoff); return count; }
@@ -3073,10 +3064,9 @@ xlog_state_switch_iclogs( log->l_curr_block += BTOBB(eventual_size)+BTOBB(log->l_iclog_hsize);
/* Round up to next log-sunit */ - if (xfs_sb_version_haslogv2(&log->l_mp->m_sb) && - log->l_mp->m_sb.sb_logsunit > 1) { - uint32_t sunit_bb = BTOBB(log->l_mp->m_sb.sb_logsunit); - log->l_curr_block = roundup(log->l_curr_block, sunit_bb); + if (log->l_iclog_roundoff > BBSIZE) { + log->l_curr_block = roundup(log->l_curr_block, + BTOBB(log->l_iclog_roundoff)); }
if (log->l_curr_block >= log->l_logBBsize) { @@ -3328,12 +3318,11 @@ xfs_log_ticket_get( * Figure out the total log space unit (in bytes) that would be * required for a log ticket. */ -int -xfs_log_calc_unit_res( - struct xfs_mount *mp, +static int +xlog_calc_unit_res( + struct xlog *log, int unit_bytes) { - struct xlog *log = mp->m_log; int iclog_space; uint num_headers;
@@ -3409,18 +3398,20 @@ xfs_log_calc_unit_res( /* for commit-rec LR header - note: padding will subsume the ophdr */ unit_bytes += log->l_iclog_hsize;
- /* for roundoff padding for transaction data and one for commit record */ - if (xfs_sb_version_haslogv2(&mp->m_sb) && mp->m_sb.sb_logsunit > 1) { - /* log su roundoff */ - unit_bytes += 2 * mp->m_sb.sb_logsunit; - } else { - /* BB roundoff */ - unit_bytes += 2 * BBSIZE; - } + /* roundoff padding for transaction data and one for commit record */ + unit_bytes += 2 * log->l_iclog_roundoff;
return unit_bytes; }
+int +xfs_log_calc_unit_res( + struct xfs_mount *mp, + int unit_bytes) +{ + return xlog_calc_unit_res(mp->m_log, unit_bytes); +} + /* * Allocate and initialise a new log ticket. */ @@ -3437,7 +3428,7 @@ xlog_ticket_alloc(
tic = kmem_cache_zalloc(xfs_log_ticket_zone, GFP_NOFS | __GFP_NOFAIL);
- unit_res = xfs_log_calc_unit_res(log->l_mp, unit_bytes); + unit_res = xlog_calc_unit_res(log, unit_bytes);
atomic_set(&tic->t_ref, 1); tic->t_task = current; diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h index 1c6fdbf3d506..037950cf1061 100644 --- a/fs/xfs/xfs_log_priv.h +++ b/fs/xfs/xfs_log_priv.h @@ -436,6 +436,8 @@ struct xlog { #endif /* log recovery lsn tracking (for buffer submission */ xfs_lsn_t l_recovery_lsn; + + uint32_t l_iclog_roundoff;/* padding roundoff */ };
#define XLOG_BUF_CANCEL_BUCKET(log, blkno) \
From: Geert Uytterhoeven geert@linux-m68k.org
mainline-inclusion from mainline-v5.13-rc4 commit 18842e0a4f48564bbed541947abd8131fd0e9734 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4KIAO CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
-------------------------------------------------
On 32-bit (e.g. m68k):
ERROR: modpost: "__udivdi3" [fs/xfs/xfs.ko] undefined!
Fix this by using a uint32_t intermediate, like before.
Reported-by: noreply@ellerman.id.au Fixes: 7660a5b48fbef958 ("xfs: log stripe roundoff is a property of the log") Signed-off-by: Geert Uytterhoeven geert@linux-m68k.org Reviewed-by: Dave Chinner dchinner@redhat.com Reviewed-by: Darrick J. Wong djwong@kernel.org Signed-off-by: Darrick J. Wong djwong@kernel.org Signed-off-by: Guo Xuenan guoxuenan@huawei.com Reviewed-by: Lihong Kou koulihong@huawei.com Reviewed-by: Zhang Yi yi.zhang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- fs/xfs/xfs_log.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 871922f4d3fe..73f94b3a4466 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -3065,8 +3065,8 @@ xlog_state_switch_iclogs(
/* Round up to next log-sunit */ if (log->l_iclog_roundoff > BBSIZE) { - log->l_curr_block = roundup(log->l_curr_block, - BTOBB(log->l_iclog_roundoff)); + uint32_t sunit_bb = BTOBB(log->l_iclog_roundoff); + log->l_curr_block = roundup(log->l_curr_block, sunit_bb); }
if (log->l_curr_block >= log->l_logBBsize) {
From: Dave Chinner dchinner@redhat.com
mainline-inclusion from mainline-v5.13-rc4 commit a79b28c284fd910bb291dbf307a26f4d432e88f3 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4KIAO CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
-------------------------------------------------
To allow for iclog IO device cache flush behaviour to be optimised, we first need to separate out the commit record iclog IO from the rest of the checkpoint so we can wait for the checkpoint IO to complete before we issue the commit record.
This separation is only necessary if the commit record is being written into a different iclog to the start of the checkpoint as the upcoming cache flushing changes requires completion ordering against the other iclogs submitted by the checkpoint.
If the entire checkpoint and commit is in the one iclog, then they are both covered by the one set of cache flush primitives on the iclog and hence there is no need to separate them for ordering.
Otherwise, we need to wait for all the previous iclogs to complete so they are ordered correctly and made stable by the REQ_PREFLUSH that the commit record iclog IO issues. This guarantees that if a reader sees the commit record in the journal, they will also see the entire checkpoint that commit record closes off.
This also provides the guarantee that when the commit record IO completes, we can safely unpin all the log items in the checkpoint so they can be written back because the entire checkpoint is stable in the journal.
Signed-off-by: Dave Chinner dchinner@redhat.com Reviewed-by: Darrick J. Wong djwong@kernel.org Reviewed-by: Chandan Babu R chandanrlinux@gmail.com Reviewed-by: Brian Foster bfoster@redhat.com Reviewed-by: Allison Henderson allison.henderson@oracle.com Signed-off-by: Darrick J. Wong djwong@kernel.org Signed-off-by: Guo Xuenan guoxuenan@huawei.com Reviewed-by: Lihong Kou koulihong@huawei.com Reviewed-by: Zhang Yi yi.zhang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- fs/xfs/xfs_log.c | 8 +++++--- fs/xfs/xfs_log_cil.c | 9 +++++++++ fs/xfs/xfs_log_priv.h | 2 ++ 3 files changed, 16 insertions(+), 3 deletions(-)
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 73f94b3a4466..f4dbf010b5e5 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -762,10 +762,12 @@ xfs_log_mount_cancel( }
/* - * Wait for the iclog to be written disk, or return an error if the log has been - * shut down. + * Wait for the iclog and all prior iclogs to be written disk as required by the + * log force state machine. Waiting on ic_force_wait ensures iclog completions + * have been ordered and callbacks run before we are woken here, hence + * guaranteeing that all the iclogs up to this one are on stable storage. */ -static int +int xlog_wait_on_iclog( struct xlog_in_core *iclog) __releases(iclog->ic_log->l_icloglock) diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c index b0ef071b3cb5..1e5fd6f268c2 100644 --- a/fs/xfs/xfs_log_cil.c +++ b/fs/xfs/xfs_log_cil.c @@ -870,6 +870,15 @@ xlog_cil_push_work( wake_up_all(&cil->xc_commit_wait); spin_unlock(&cil->xc_push_lock);
+ /* + * If the checkpoint spans multiple iclogs, wait for all previous + * iclogs to complete before we submit the commit_iclog. + */ + if (ctx->start_lsn != commit_lsn) { + spin_lock(&log->l_icloglock); + xlog_wait_on_iclog(commit_iclog->ic_prev); + } + /* release the hounds! */ xfs_log_release_iclog(commit_iclog); return; diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h index 037950cf1061..ee7786b33da9 100644 --- a/fs/xfs/xfs_log_priv.h +++ b/fs/xfs/xfs_log_priv.h @@ -584,6 +584,8 @@ xlog_wait( remove_wait_queue(wq, &wait); }
+int xlog_wait_on_iclog(struct xlog_in_core *iclog); + /* * The LSN is valid so long as it is behind the current LSN. If it isn't, this * means that the next log record that includes this metadata could have a
From: Dave Chinner dchinner@redhat.com
mainline-inclusion from mainline-v5.13-rc4 commit 0431d926b399d74f1cde2c355d48289c6d7fa882 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4KIAO CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
-------------------------------------------------
The new checkpoint cache flush mechanism requires us to issue an unconditional cache flush before we start a new checkpoint. We don't want to block for this if we can help it, and we have a fair chunk of CPU work to do between starting the checkpoint and issuing the first journal IO.
Hence it makes sense to amortise the latency cost of the cache flush by issuing it asynchronously and then waiting for it only when we need to issue the first IO in the transaction.
To do this, we need async cache flush primitives to submit the cache flush bio and to wait on it. The block layer has no such primitives for filesystems, so roll our own for the moment.
Signed-off-by: Dave Chinner dchinner@redhat.com Reviewed-by: Brian Foster bfoster@redhat.com Reviewed-by: Darrick J. Wong djwong@kernel.org Reviewed-by: Allison Henderson allison.henderson@oracle.com Signed-off-by: Darrick J. Wong djwong@kernel.org Signed-off-by: Guo Xuenan guoxuenan@huawei.com Reviewed-by: Lihong Kou koulihong@huawei.com Reviewed-by: Zhang Yi yi.zhang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- fs/xfs/xfs_bio_io.c | 35 +++++++++++++++++++++++++++++++++++ fs/xfs/xfs_linux.h | 2 ++ 2 files changed, 37 insertions(+)
diff --git a/fs/xfs/xfs_bio_io.c b/fs/xfs/xfs_bio_io.c index e2148f2d5d6b..b20a9b639acb 100644 --- a/fs/xfs/xfs_bio_io.c +++ b/fs/xfs/xfs_bio_io.c @@ -9,6 +9,41 @@ static inline unsigned int bio_max_vecs(unsigned int count) return min_t(unsigned, howmany(count, PAGE_SIZE), BIO_MAX_PAGES); }
+static void +xfs_flush_bdev_async_endio( + struct bio *bio) +{ + complete(bio->bi_private); +} + +/* + * Submit a request for an async cache flush to run. If the request queue does + * not require flush operations, just skip it altogether. If the caller needs + * to wait for the flush completion at a later point in time, they must supply a + * valid completion. This will be signalled when the flush completes. The + * caller never sees the bio that is issued here. + */ +void +xfs_flush_bdev_async( + struct bio *bio, + struct block_device *bdev, + struct completion *done) +{ + struct request_queue *q = bdev->bd_disk->queue; + + if (!test_bit(QUEUE_FLAG_WC, &q->queue_flags)) { + complete(done); + return; + } + + bio_init(bio, NULL, 0); + bio_set_dev(bio, bdev); + bio->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH | REQ_SYNC; + bio->bi_private = done; + bio->bi_end_io = xfs_flush_bdev_async_endio; + + submit_bio(bio); +} int xfs_rw_bdev( struct block_device *bdev, diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h index 5b7a1e201559..08e3bd3f859a 100644 --- a/fs/xfs/xfs_linux.h +++ b/fs/xfs/xfs_linux.h @@ -197,6 +197,8 @@ static inline uint64_t howmany_64(uint64_t x, uint32_t y)
int xfs_rw_bdev(struct block_device *bdev, sector_t sector, unsigned int count, char *data, unsigned int op); +void xfs_flush_bdev_async(struct bio *bio, struct block_device *bdev, + struct completion *done);
#define ASSERT_ALWAYS(expr) \ (likely(expr) ? (void)0 : assfail(NULL, #expr, __FILE__, __LINE__))
From: Dave Chinner dchinner@redhat.com
mainline-inclusion from mainline-v5.13-rc4 commit bad77c375e8de6c776c848e443f7dc2d0d909be5 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4KIAO CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
xfs: CIL checkpoint flushes caches unconditionally
Currently every journal IO is issued as REQ_PREFLUSH | REQ_FUA to guarantee the ordering requirements the journal has w.r.t. metadata writeback. THe two ordering constraints are:
1. we cannot overwrite metadata in the journal until we guarantee that the dirty metadata has been written back in place and is stable.
2. we cannot write back dirty metadata until it has been written to the journal and guaranteed to be stable (and hence recoverable) in the journal.
These rules apply to the atomic transactions recorded in the journal, not to the journal IO itself. Hence we need to ensure metadata is stable before we start writing a new transaction to the journal (guarantee #1), and we need to ensure the entire transaction is stable in the journal before we start metadata writeback (guarantee #2).
The ordering guarantees of #1 are currently provided by REQ_PREFLUSH being added to every iclog IO. This causes the journal IO to issue a cache flush and wait for it to complete before issuing the write IO to the journal. Hence all completed metadata IO is guaranteed to be stable before the journal overwrites the old metadata.
However, for long running CIL checkpoints that might do a thousand journal IOs, we don't need every single one of these iclog IOs to issue a cache flush - the cache flush done before the first iclog is submitted is sufficient to cover the entire range in the log that the checkpoint will overwrite because the CIL space reservation guarantees the tail of the log (completed metadata) is already beyond the range of the checkpoint write.
Hence we only need a full cache flush between closing off the CIL checkpoint context (i.e. when the push switches it out) and issuing the first journal IO. Rather than plumbing this through to the journal IO, we can start this cache flush the moment the CIL context is owned exclusively by the push worker. The cache flush can be in progress while we process the CIL ready for writing, hence reducing the latency of the initial iclog write. This is especially true for large checkpoints, where we might have to process hundreds of thousands of log vectors before we issue the first iclog write. In these cases, it is likely the cache flush has already been completed by the time we have built the CIL log vector chain.
Signed-off-by: Dave Chinner dchinner@redhat.com Reviewed-by: Chandan Babu R chandanrlinux@gmail.com Reviewed-by: Darrick J. Wong djwong@kernel.org Reviewed-by: Brian Foster bfoster@redhat.com Reviewed-by: Allison Henderson allison.henderson@oracle.com Signed-off-by: Darrick J. Wong djwong@kernel.org Signed-off-by: Guo Xuenan guoxuenan@huawei.com Reviewed-by: Lihong Kou koulihong@huawei.com Reviewed-by: Zhang Yi yi.zhang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- fs/xfs/xfs_log_cil.c | 25 +++++++++++++++++++++---- 1 file changed, 21 insertions(+), 4 deletions(-)
diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c index 1e5fd6f268c2..7b8b7ac85ea9 100644 --- a/fs/xfs/xfs_log_cil.c +++ b/fs/xfs/xfs_log_cil.c @@ -656,6 +656,8 @@ xlog_cil_push_work( struct xfs_log_vec lvhdr = { NULL }; xfs_lsn_t commit_lsn; xfs_lsn_t push_seq; + struct bio bio; + DECLARE_COMPLETION_ONSTACK(bdev_flush);
new_ctx = kmem_zalloc(sizeof(*new_ctx), KM_NOFS); new_ctx->ticket = xlog_cil_ticket_alloc(log); @@ -719,10 +721,19 @@ xlog_cil_push_work( spin_unlock(&cil->xc_push_lock);
/* - * pull all the log vectors off the items in the CIL, and - * remove the items from the CIL. We don't need the CIL lock - * here because it's only needed on the transaction commit - * side which is currently locked out by the flush lock. + * The CIL is stable at this point - nothing new will be added to it + * because we hold the flush lock exclusively. Hence we can now issue + * a cache flush to ensure all the completed metadata in the journal we + * are about to overwrite is on stable storage. + */ + xfs_flush_bdev_async(&bio, log->l_mp->m_ddev_targp->bt_bdev, + &bdev_flush); + + /* + * Pull all the log vectors off the items in the CIL, and remove the + * items from the CIL. We don't need the CIL lock here because it's only + * needed on the transaction commit side which is currently locked out + * by the flush lock. */ lv = NULL; num_iovecs = 0; @@ -806,6 +817,12 @@ xlog_cil_push_work( lvhdr.lv_iovecp = &lhdr; lvhdr.lv_next = ctx->lv_chain;
+ /* + * Before we format and submit the first iclog, we have to ensure that + * the metadata writeback ordering cache flush is complete. + */ + wait_for_completion(&bdev_flush); + error = xlog_write(log, &lvhdr, tic, &ctx->start_lsn, NULL, 0, true); if (error) goto out_abort_free_ticket;
From: Dave Chinner dchinner@redhat.com
mainline-inclusion from mainline-v5.13-rc4 commit 3468bb1ca6e8840789e13c7b9d8b0c556b4fbe79 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4KIAO CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
-------------------------------------------------
The CIL push is the only call to xlog_write that sets this variable to true. The other callers don't need a start rec, and they tell xlog_write what to do by passing the type of ophdr they need written in the flags field. The need_start_rec parameter essentially tells xlog_write to to write an extra ophdr with a XLOG_START_TRANS type, so get rid of the variable to do this and pass XLOG_START_TRANS as the flag value into xlog_write() from the CIL push.
$ size fs/xfs/xfs_log.o* text data bss dec hex filename 27595 560 8 28163 6e03 fs/xfs/xfs_log.o.orig 27454 560 8 28022 6d76 fs/xfs/xfs_log.o.patched
Signed-off-by: Dave Chinner dchinner@redhat.com Reviewed-by: Chandan Babu R chandanrlinux@gmail.com Reviewed-by: Darrick J. Wong djwong@kernel.org Reviewed-by: Allison Henderson allison.henderson@oracle.com Signed-off-by: Darrick J. Wong djwong@kernel.org Signed-off-by: Guo Xuenan guoxuenan@huawei.com Reviewed-by: Lihong Kou koulihong@huawei.com Reviewed-by: Zhang Yi yi.zhang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- fs/xfs/xfs_log.c | 44 +++++++++++++++++++++---------------------- fs/xfs/xfs_log_cil.c | 3 ++- fs/xfs/xfs_log_priv.h | 3 +-- 3 files changed, 25 insertions(+), 25 deletions(-)
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index f4dbf010b5e5..1e6e5735ddcd 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -796,9 +796,7 @@ xlog_wait_on_iclog( static int xlog_write_unmount_record( struct xlog *log, - struct xlog_ticket *ticket, - xfs_lsn_t *lsn, - uint flags) + struct xlog_ticket *ticket) { struct xfs_unmount_log_format ulf = { .magic = XLOG_UNMOUNT_TYPE, @@ -815,7 +813,7 @@ xlog_write_unmount_record(
/* account for space used by record data */ ticket->t_curr_res -= sizeof(ulf); - return xlog_write(log, &vec, ticket, lsn, NULL, flags, false); + return xlog_write(log, &vec, ticket, NULL, NULL, XLOG_UNMOUNT_TRANS); }
/* @@ -829,15 +827,13 @@ xlog_unmount_write( struct xfs_mount *mp = log->l_mp; struct xlog_in_core *iclog; struct xlog_ticket *tic = NULL; - xfs_lsn_t lsn; - uint flags = XLOG_UNMOUNT_TRANS; int error;
error = xfs_log_reserve(mp, 600, 1, &tic, XFS_LOG, 0); if (error) goto out_err;
- error = xlog_write_unmount_record(log, tic, &lsn, flags); + error = xlog_write_unmount_record(log, tic); /* * At this point, we're umounting anyway, so there's no point in * transitioning log state to IOERROR. Just continue... @@ -1474,8 +1470,7 @@ xlog_commit_record( if (XLOG_FORCED_SHUTDOWN(log)) return -EIO;
- error = xlog_write(log, &vec, ticket, lsn, iclog, XLOG_COMMIT_TRANS, - false); + error = xlog_write(log, &vec, ticket, lsn, iclog, XLOG_COMMIT_TRANS); if (error) xfs_force_shutdown(log->l_mp, SHUTDOWN_LOG_IO_ERROR); return error; @@ -2072,13 +2067,16 @@ static int xlog_write_calc_vec_length( struct xlog_ticket *ticket, struct xfs_log_vec *log_vector, - bool need_start_rec) + uint optype) { struct xfs_log_vec *lv; - int headers = need_start_rec ? 1 : 0; + int headers = 0; int len = 0; int i;
+ if (optype & XLOG_START_TRANS) + headers++; + for (lv = log_vector; lv; lv = lv->lv_next) { /* we don't write ordered log vectors */ if (lv->lv_buf_len == XFS_LOG_VEC_ORDERED) @@ -2298,8 +2296,7 @@ xlog_write( struct xlog_ticket *ticket, xfs_lsn_t *start_lsn, struct xlog_in_core **commit_iclog, - uint flags, - bool need_start_rec) + uint optype) { struct xlog_in_core *iclog = NULL; struct xfs_log_vec *lv = log_vector; @@ -2327,8 +2324,9 @@ xlog_write( xfs_force_shutdown(log->l_mp, SHUTDOWN_LOG_IO_ERROR); }
- len = xlog_write_calc_vec_length(ticket, log_vector, need_start_rec); - *start_lsn = 0; + len = xlog_write_calc_vec_length(ticket, log_vector, optype); + if (start_lsn) + *start_lsn = 0; while (lv && (!lv->lv_niovecs || index < lv->lv_niovecs)) { void *ptr; int log_offset; @@ -2342,7 +2340,7 @@ xlog_write( ptr = iclog->ic_datap + log_offset;
/* start_lsn is the first lsn written to. That's all we need. */ - if (!*start_lsn) + if (start_lsn && !*start_lsn) *start_lsn = be64_to_cpu(iclog->ic_header.h_lsn);
/* @@ -2355,6 +2353,7 @@ xlog_write( int copy_len; int copy_off; bool ordered = false; + bool wrote_start_rec = false;
/* ordered log vectors have no regions to write */ if (lv->lv_buf_len == XFS_LOG_VEC_ORDERED) { @@ -2372,13 +2371,15 @@ xlog_write( * write a start record. Only do this for the first * iclog we write to. */ - if (need_start_rec) { + if (optype & XLOG_START_TRANS) { xlog_write_start_rec(ptr, ticket); xlog_write_adv_cnt(&ptr, &len, &log_offset, sizeof(struct xlog_op_header)); + optype &= ~XLOG_START_TRANS; + wrote_start_rec = true; }
- ophdr = xlog_write_setup_ophdr(log, ptr, ticket, flags); + ophdr = xlog_write_setup_ophdr(log, ptr, ticket, optype); if (!ophdr) return -EIO;
@@ -2409,14 +2410,13 @@ xlog_write( } copy_len += sizeof(struct xlog_op_header); record_cnt++; - if (need_start_rec) { + if (wrote_start_rec) { copy_len += sizeof(struct xlog_op_header); record_cnt++; - need_start_rec = false; } data_cnt += contwr ? copy_len : 0;
- error = xlog_write_copy_finish(log, iclog, flags, + error = xlog_write_copy_finish(log, iclog, optype, &record_cnt, &data_cnt, &partial_copy, &partial_copy_len, @@ -2460,7 +2460,7 @@ xlog_write( spin_lock(&log->l_icloglock); xlog_state_finish_copy(log, iclog, record_cnt, data_cnt); if (commit_iclog) { - ASSERT(flags & XLOG_COMMIT_TRANS); + ASSERT(optype & XLOG_COMMIT_TRANS); *commit_iclog = iclog; } else { error = xlog_state_release_iclog(log, iclog); diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c index 7b8b7ac85ea9..172bb3551d6b 100644 --- a/fs/xfs/xfs_log_cil.c +++ b/fs/xfs/xfs_log_cil.c @@ -823,7 +823,8 @@ xlog_cil_push_work( */ wait_for_completion(&bdev_flush);
- error = xlog_write(log, &lvhdr, tic, &ctx->start_lsn, NULL, 0, true); + error = xlog_write(log, &lvhdr, tic, &ctx->start_lsn, NULL, + XLOG_START_TRANS); if (error) goto out_abort_free_ticket;
diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h index ee7786b33da9..56e1942c47df 100644 --- a/fs/xfs/xfs_log_priv.h +++ b/fs/xfs/xfs_log_priv.h @@ -480,8 +480,7 @@ void xlog_print_tic_res(struct xfs_mount *mp, struct xlog_ticket *ticket); void xlog_print_trans(struct xfs_trans *); int xlog_write(struct xlog *log, struct xfs_log_vec *log_vector, struct xlog_ticket *tic, xfs_lsn_t *start_lsn, - struct xlog_in_core **commit_iclog, uint flags, - bool need_start_rec); + struct xlog_in_core **commit_iclog, uint optype); int xlog_commit_record(struct xlog *log, struct xlog_ticket *ticket, struct xlog_in_core **iclog, xfs_lsn_t *lsn); void xfs_log_ticket_ungrant(struct xlog *log, struct xlog_ticket *ticket);
From: Dave Chinner dchinner@redhat.com
mainline-inclusion from mainline-v5.13-rc4 commit eef983ffeae7a1cdde8c3338155ae2dd74b8621b category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4KIAO CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
-------------------------------------------------
Currently every journal IO is issued as REQ_PREFLUSH | REQ_FUA to guarantee the ordering requirements the journal has w.r.t. metadata writeback. THe two ordering constraints are:
1. we cannot overwrite metadata in the journal until we guarantee that the dirty metadata has been written back in place and is stable.
2. we cannot write back dirty metadata until it has been written to the journal and guaranteed to be stable (and hence recoverable) in the journal.
The ordering guarantees of #1 are provided by REQ_PREFLUSH. This causes the journal IO to issue a cache flush and wait for it to complete before issuing the write IO to the journal. Hence all completed metadata IO is guaranteed to be stable before the journal overwrites the old metadata.
The ordering guarantees of #2 are provided by the REQ_FUA, which ensures the journal writes do not complete until they are on stable storage. Hence by the time the last journal IO in a checkpoint completes, we know that the entire checkpoint is on stable storage and we can unpin the dirty metadata and allow it to be written back.
This is the mechanism by which ordering was first implemented in XFS way back in 2002 by commit 95d97c36e5155075ba2eb22b17562cfcc53fcf96 ("Add support for drive write cache flushing") in the xfs-archive tree.
A lot has changed since then, most notably we now use delayed logging to checkpoint the filesystem to the journal rather than write each individual transaction to the journal. Cache flushes on journal IO are necessary when individual transactions are wholly contained within a single iclog. However, CIL checkpoints are single transactions that typically span hundreds to thousands of individual journal writes, and so the requirements for device cache flushing have changed.
That is, the ordering rules I state above apply to ordering of atomic transactions recorded in the journal, not to the journal IO itself. Hence we need to ensure metadata is stable before we start writing a new transaction to the journal (guarantee #1), and we need to ensure the entire transaction is stable in the journal before we start metadata writeback (guarantee #2).
Hence we only need a REQ_PREFLUSH on the journal IO that starts a new journal transaction to provide #1, and it is not on any other journal IO done within the context of that journal transaction.
The CIL checkpoint already issues a cache flush before it starts writing to the log, so we no longer need the iclog IO to issue a REQ_REFLUSH for us. Hence if XLOG_START_TRANS is passed to xlog_write(), we no longer need to mark the first iclog in the log write with REQ_PREFLUSH for this case. As an added bonus, this ordering mechanism works for both internal and external logs, meaning we can remove the explicit data device cache flushes from the iclog write code when using external logs.
Given the new ordering semantics of commit records for the CIL, we need iclogs containing commit records to issue a REQ_PREFLUSH. We also require unmount records to do this. Hence for both XLOG_COMMIT_TRANS and XLOG_UNMOUNT_TRANS xlog_write() calls we need to mark the first iclog being written with REQ_PREFLUSH.
For both commit records and unmount records, we also want them immediately on stable storage, so we want to also mark the iclogs that contain these records to be marked REQ_FUA. That means if a record is split across multiple iclogs, they are all marked REQ_FUA and not just the last one so that when the transaction is completed all the parts of the record are on stable storage.
And for external logs, unmount records need a pre-write data device cache flush similar to the CIL checkpoint cache pre-flush as the internal iclog write code does not do this implicitly anymore.
As an optimisation, when the commit record lands in the same iclog as the journal transaction starts, we don't need to wait for anything and can simply use REQ_FUA to provide guarantee #2. This means that for fsync() heavy workloads, the cache flush behaviour is completely unchanged and there is no degradation in performance as a result of optimise the multi-IO transaction case.
The most notable sign that there is less IO latency on my test machine (nvme SSDs) is that the "noiclogs" rate has dropped substantially. This metric indicates that the CIL push is blocking in xlog_get_iclog_space() waiting for iclog IO completion to occur. With 8 iclogs of 256kB, the rate is appoximately 1 noiclog event to every 4 iclog writes. IOWs, every 4th call to xlog_get_iclog_space() is blocking waiting for log IO. With the changes in this patch, this drops to 1 noiclog event for every 100 iclog writes. Hence it is clear that log IO is completing much faster than it was previously, but it is also clear that for large iclog sizes, this isn't the performance limiting factor on this hardware.
With smaller iclogs (32kB), however, there is a substantial difference. With the cache flush modifications, the journal is now running at over 4000 write IOPS, and the journal throughput is largely identical to the 256kB iclogs and the noiclog event rate stays low at about 1:50 iclog writes. The existing code tops out at about 2500 IOPS as the number of cache flushes dominate performance and latency. The noiclog event rate is about 1:4, and the performance variance is quite large as the journal throughput can fall to less than half the peak sustained rate when the cache flush rate prevents metadata writeback from keeping up and the log runs out of space and throttles reservations.
As a result:
logbsize fsmark create rate rm -rf before 32kb 152851+/-5.3e+04 5m28s patched 32kb 221533+/-1.1e+04 5m24s
before 256kb 220239+/-6.2e+03 4m58s patched 256kb 228286+/-9.2e+03 5m06s
The rm -rf times are included because I ran them, but the differences are largely noise. This workload is largely metadata read IO latency bound and the changes to the journal cache flushing doesn't really make any noticable difference to behaviour apart from a reduction in noiclog events from background CIL pushing.
Signed-off-by: Dave Chinner dchinner@redhat.com Reviewed-by: Chandan Babu R chandanrlinux@gmail.com Reviewed-by: Darrick J. Wong djwong@kernel.org Reviewed-by: Allison Henderson allison.henderson@oracle.com Signed-off-by: Darrick J. Wong djwong@kernel.org Signed-off-by: Guo Xuenan guoxuenan@huawei.com Reviewed-by: Lihong Kou koulihong@huawei.com Reviewed-by: Zhang Yi yi.zhang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- fs/xfs/xfs_log.c | 65 +++++++++++++++---------------------------- fs/xfs/xfs_log.h | 1 - fs/xfs/xfs_log_cil.c | 18 +++++++++--- fs/xfs/xfs_log_priv.h | 6 ++++ 4 files changed, 43 insertions(+), 47 deletions(-)
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 1e6e5735ddcd..001307f40f99 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -489,7 +489,7 @@ __xlog_state_release_iclog( * Flush iclog to disk if this is the last reference to the given iclog and the * it is in the WANT_SYNC state. */ -static int +int xlog_state_release_iclog( struct xlog *log, struct xlog_in_core *iclog) @@ -509,23 +509,6 @@ xlog_state_release_iclog( return 0; }
-void -xfs_log_release_iclog( - struct xlog_in_core *iclog) -{ - struct xlog *log = iclog->ic_log; - bool sync = false; - - if (atomic_dec_and_lock(&iclog->ic_refcnt, &log->l_icloglock)) { - if (iclog->ic_state != XLOG_STATE_IOERROR) - sync = __xlog_state_release_iclog(log, iclog); - spin_unlock(&log->l_icloglock); - } - - if (sync) - xlog_sync(log, iclog); -} - /* * Mount a log filesystem * @@ -813,6 +796,14 @@ xlog_write_unmount_record(
/* account for space used by record data */ ticket->t_curr_res -= sizeof(ulf); + + /* + * For external log devices, we need to flush the data device cache + * first to ensure all metadata writeback is on stable storage before we + * stamp the tail LSN into the unmount record. + */ + if (log->l_targ != log->l_mp->m_ddev_targp) + xfs_blkdev_issue_flush(log->l_targ); return xlog_write(log, &vec, ticket, NULL, NULL, XLOG_UNMOUNT_TRANS); }
@@ -850,6 +841,11 @@ xlog_unmount_write( else ASSERT(iclog->ic_state == XLOG_STATE_WANT_SYNC || iclog->ic_state == XLOG_STATE_IOERROR); + /* + * Ensure the journal is fully flushed and on stable storage once the + * iclog containing the unmount record is written. + */ + iclog->ic_flags |= (XLOG_ICL_NEED_FLUSH | XLOG_ICL_NEED_FUA); error = xlog_state_release_iclog(log, iclog); xlog_wait_on_iclog(iclog);
@@ -1676,8 +1672,7 @@ xlog_write_iclog( struct xlog *log, struct xlog_in_core *iclog, uint64_t bno, - unsigned int count, - bool need_flush) + unsigned int count) { ASSERT(bno < log->l_logBBsize);
@@ -1715,10 +1710,12 @@ xlog_write_iclog( * writeback throttle from throttling log writes behind background * metadata writeback and causing priority inversions. */ - iclog->ic_bio.bi_opf = REQ_OP_WRITE | REQ_META | REQ_SYNC | - REQ_IDLE | REQ_FUA; - if (need_flush) + iclog->ic_bio.bi_opf = REQ_OP_WRITE | REQ_META | REQ_SYNC | REQ_IDLE; + if (iclog->ic_flags & XLOG_ICL_NEED_FLUSH) iclog->ic_bio.bi_opf |= REQ_PREFLUSH; + if (iclog->ic_flags & XLOG_ICL_NEED_FUA) + iclog->ic_bio.bi_opf |= REQ_FUA; + iclog->ic_flags &= ~(XLOG_ICL_NEED_FLUSH | XLOG_ICL_NEED_FUA);
if (xlog_map_iclog_data(&iclog->ic_bio, iclog->ic_data, count)) { xfs_force_shutdown(log->l_mp, SHUTDOWN_LOG_IO_ERROR); @@ -1821,7 +1818,6 @@ xlog_sync( unsigned int roundoff; /* roundoff to BB or stripe */ uint64_t bno; unsigned int size; - bool need_flush = true, split = false;
ASSERT(atomic_read(&iclog->ic_refcnt) == 0);
@@ -1846,10 +1842,8 @@ xlog_sync( bno = BLOCK_LSN(be64_to_cpu(iclog->ic_header.h_lsn));
/* Do we need to split this write into 2 parts? */ - if (bno + BTOBB(count) > log->l_logBBsize) { + if (bno + BTOBB(count) > log->l_logBBsize) xlog_split_iclog(log, &iclog->ic_header, bno, count); - split = true; - }
/* calculcate the checksum */ iclog->ic_header.h_crc = xlog_cksum(log, &iclog->ic_header, @@ -1871,21 +1865,8 @@ xlog_sync( } #endif
- /* - * Flush the data device before flushing the log to make sure all meta - * data written back from the AIL actually made it to disk before - * stamping the new log tail LSN into the log buffer. For an external - * log we need to issue the flush explicitly, and unfortunately - * synchronously here; for an internal log we can simply use the block - * layer state machine for preflushes. - */ - if (log->l_targ != log->l_mp->m_ddev_targp || split) { - xfs_blkdev_issue_flush(log->l_mp->m_ddev_targp); - need_flush = false; - } - xlog_verify_iclog(log, iclog, count); - xlog_write_iclog(log, iclog, bno, count, need_flush); + xlog_write_iclog(log, iclog, bno, count); }
/* @@ -2339,7 +2320,7 @@ xlog_write( ASSERT(log_offset <= iclog->ic_size - 1); ptr = iclog->ic_datap + log_offset;
- /* start_lsn is the first lsn written to. That's all we need. */ + /* Start_lsn is the first lsn written to. */ if (start_lsn && !*start_lsn) *start_lsn = be64_to_cpu(iclog->ic_header.h_lsn);
diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h index 58c3fcbec94a..7c8a775781d3 100644 --- a/fs/xfs/xfs_log.h +++ b/fs/xfs/xfs_log.h @@ -117,7 +117,6 @@ void xfs_log_mount_cancel(struct xfs_mount *); xfs_lsn_t xlog_assign_tail_lsn(struct xfs_mount *mp); xfs_lsn_t xlog_assign_tail_lsn_locked(struct xfs_mount *mp); void xfs_log_space_wake(struct xfs_mount *mp); -void xfs_log_release_iclog(struct xlog_in_core *iclog); int xfs_log_reserve(struct xfs_mount *mp, int length, int count, diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c index 172bb3551d6b..9d2fa8464289 100644 --- a/fs/xfs/xfs_log_cil.c +++ b/fs/xfs/xfs_log_cil.c @@ -890,15 +890,25 @@ xlog_cil_push_work(
/* * If the checkpoint spans multiple iclogs, wait for all previous - * iclogs to complete before we submit the commit_iclog. + * iclogs to complete before we submit the commit_iclog. In this case, + * the commit_iclog write needs to issue a pre-flush so that the + * ordering is correctly preserved down to stable storage. */ + spin_lock(&log->l_icloglock); if (ctx->start_lsn != commit_lsn) { - spin_lock(&log->l_icloglock); xlog_wait_on_iclog(commit_iclog->ic_prev); + spin_lock(&log->l_icloglock); + commit_iclog->ic_flags |= XLOG_ICL_NEED_FLUSH; }
- /* release the hounds! */ - xfs_log_release_iclog(commit_iclog); + /* + * The commit iclog must be written to stable storage to guarantee + * journal IO vs metadata writeback IO is correctly ordered on stable + * storage. + */ + commit_iclog->ic_flags |= XLOG_ICL_NEED_FUA; + xlog_state_release_iclog(log, commit_iclog); + spin_unlock(&log->l_icloglock); return;
out_skip: diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h index 56e1942c47df..2203ccecafb6 100644 --- a/fs/xfs/xfs_log_priv.h +++ b/fs/xfs/xfs_log_priv.h @@ -133,6 +133,9 @@ enum xlog_iclog_state {
#define XLOG_COVER_OPS 5
+#define XLOG_ICL_NEED_FLUSH (1 << 0) /* iclog needs REQ_PREFLUSH */ +#define XLOG_ICL_NEED_FUA (1 << 1) /* iclog needs REQ_FUA */ + /* Ticket reservation region accounting */ #define XLOG_TIC_LEN_MAX 15
@@ -201,6 +204,7 @@ typedef struct xlog_in_core { u32 ic_size; u32 ic_offset; enum xlog_iclog_state ic_state; + unsigned int ic_flags; char *ic_datap; /* pointer to iclog data */
/* Callback structures need their own cacheline */ @@ -486,6 +490,8 @@ int xlog_commit_record(struct xlog *log, struct xlog_ticket *ticket, void xfs_log_ticket_ungrant(struct xlog *log, struct xlog_ticket *ticket); void xfs_log_ticket_regrant(struct xlog *log, struct xlog_ticket *ticket);
+int xlog_state_release_iclog(struct xlog *log, struct xlog_in_core *iclog); + /* * When we crack an atomic LSN, we sample it first so that the value will not * change while we are cracking it into the component values. This means we
From: Dave Chinner dchinner@redhat.com
mainline-inclusion from mainline-v5.13-rc4 commit 19f4e7cc819771812a7f527d7897c2deffbf7a00 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4KIAO CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
-------------------------------------------------
A hang with tasks stuck on the CIL hard throttle was reported and largely diagnosed by Donald Buczek, who discovered that it was a result of the CIL context space usage decrementing in committed transactions once the hard throttle limit had been hit and processes were already blocked. This resulted in the CIL push not waking up those waiters because the CIL context was no longer over the hard throttle limit.
The surprising aspect of this was the CIL space usage going backwards regularly enough to trigger this situation. Assumptions had been made in design that the relogging process would only increase the size of the objects in the CIL, and so that space would only increase.
This change and commit message fixes the issue and documents the result of an audit of the triggers that can cause the CIL space to go backwards, how large the backwards steps tend to be, the frequency in which they occur, and what the impact on the CIL accounting code is.
Even though the CIL ctx->space_used can go backwards, it will only do so if the log item is already logged to the CIL and contains a space reservation for it's entire logged state. This is tracked by the shadow buffer state on the log item. If the item is not previously logged in the CIL it has no shadow buffer nor log vector, and hence the entire size of the logged item copied to the log vector is accounted to the CIL space usage. i.e. it will always go up in this case.
If the item has a log vector (i.e. already in the CIL) and the size decreases, then the existing log vector will be overwritten and the space usage will go down. This is the only condition where the space usage reduces, and it can only occur when an item is already tracked in the CIL. Hence we are safe from CIL space usage underruns as a result of log items decreasing in size when they are relogged.
Typically this reduction in CIL usage occurs from metadata blocks being free, such as when a btree block merge occurs or a directory enter/xattr entry is removed and the da-tree is reduced in size. This generally results in a reduction in size of around a single block in the CIL, but also tends to increase the number of log vectors because the parent and sibling nodes in the tree needs to be updated when a btree block is removed. If a multi-level merge occurs, then we see reduction in size of 2+ blocks, but again the log vector count goes up.
The other vector is inode fork size changes, which only log the current size of the fork and ignore the previously logged size when the fork is relogged. Hence if we are removing items from the inode fork (dir/xattr removal in shortform, extent record removal in extent form, etc) the relogged size of the inode for can decrease.
No other log items can decrease in size either because they are a fixed size (e.g. dquots) or they cannot be relogged (e.g. relogging an intent actually creates a new intent log item and doesn't relog the old item at all.) Hence the only two vectors for CIL context size reduction are relogging inode forks and marking buffers active in the CIL as stale.
Long story short: the majority of the code does the right thing and handles the reduction in log item size correctly, and only the CIL hard throttle implementation is problematic and needs fixing. This patch makes that fix, as well as adds comments in the log item code that result in items shrinking in size when they are relogged as a clear reminder that this can and does happen frequently.
The throttle fix is based upon the change Donald proposed, though it goes further to ensure that once the throttle is activated, it captures all tasks until the CIL push issues a wakeup, regardless of whether the CIL space used has gone back under the throttle threshold.
This ensures that we prevent tasks reducing the CIL slightly under the throttle threshold and then making more changes that push it well over the throttle limit. This is acheived by checking if the throttle wait queue is already active as a condition of throttling. Hence once we start throttling, we continue to apply the throttle until the CIL context push wakes everything on the wait queue.
We can use waitqueue_active() for the waitqueue manipulations and checks as they are all done under the ctx->xc_push_lock. Hence the waitqueue has external serialisation and we can safely peek inside the wait queue without holding the internal waitqueue locks.
Many thanks to Donald for his diagnostic and analysis work to isolate the cause of this hang.
Reported-and-tested-by: Donald Buczek buczek@molgen.mpg.de Signed-off-by: Dave Chinner dchinner@redhat.com Reviewed-by: Brian Foster bfoster@redhat.com Reviewed-by: Chandan Babu R chandanrlinux@gmail.com Reviewed-by: Darrick J. Wong djwong@kernel.org Reviewed-by: Allison Henderson allison.henderson@oracle.com Signed-off-by: Darrick J. Wong djwong@kernel.org Signed-off-by: Guo Xuenan guoxuenan@huawei.com Reviewed-by: Lihong Kou koulihong@huawei.com Reviewed-by: Zhang Yi yi.zhang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- fs/xfs/xfs_buf_item.c | 37 ++++++++++++++++++------------------- fs/xfs/xfs_inode_item.c | 14 ++++++++++++++ fs/xfs/xfs_log_cil.c | 22 +++++++++++++++++----- 3 files changed, 49 insertions(+), 24 deletions(-)
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index 0356f2e340a1..8c6e26d62ef2 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -56,14 +56,12 @@ xfs_buf_log_format_size( }
/* - * This returns the number of log iovecs needed to log the - * given buf log item. + * Return the number of log iovecs and space needed to log the given buf log + * item segment. * - * It calculates this as 1 iovec for the buf log format structure - * and 1 for each stretch of non-contiguous chunks to be logged. - * Contiguous chunks are logged in a single iovec. - * - * If the XFS_BLI_STALE flag has been set, then log nothing. + * It calculates this as 1 iovec for the buf log format structure and 1 for each + * stretch of non-contiguous chunks to be logged. Contiguous chunks are logged + * in a single iovec. */ STATIC void xfs_buf_item_size_segment( @@ -119,11 +117,8 @@ xfs_buf_item_size_segment( }
/* - * This returns the number of log iovecs needed to log the given buf log item. - * - * It calculates this as 1 iovec for the buf log format structure and 1 for each - * stretch of non-contiguous chunks to be logged. Contiguous chunks are logged - * in a single iovec. + * Return the number of log iovecs and space needed to log the given buf log + * item. * * Discontiguous buffers need a format structure per region that is being * logged. This makes the changes in the buffer appear to log recovery as though @@ -133,7 +128,11 @@ xfs_buf_item_size_segment( * what ends up on disk. * * If the XFS_BLI_STALE flag has been set, then log nothing but the buf log - * format structures. + * format structures. If the item has previously been logged and has dirty + * regions, we do not relog them in stale buffers. This has the effect of + * reducing the size of the relogged item by the amount of dirty data tracked + * by the log item. This can result in the committing transaction reducing the + * amount of space being consumed by the CIL. */ STATIC void xfs_buf_item_size( @@ -147,9 +146,9 @@ xfs_buf_item_size( ASSERT(atomic_read(&bip->bli_refcount) > 0); if (bip->bli_flags & XFS_BLI_STALE) { /* - * The buffer is stale, so all we need to log - * is the buf log format structure with the - * cancel flag in it. + * The buffer is stale, so all we need to log is the buf log + * format structure with the cancel flag in it as we are never + * going to replay the changes tracked in the log item. */ trace_xfs_buf_item_size_stale(bip); ASSERT(bip->__bli_format.blf_flags & XFS_BLF_CANCEL); @@ -164,9 +163,9 @@ xfs_buf_item_size(
if (bip->bli_flags & XFS_BLI_ORDERED) { /* - * The buffer has been logged just to order it. - * It is not being included in the transaction - * commit, so no vectors are used at all. + * The buffer has been logged just to order it. It is not being + * included in the transaction commit, so no vectors are used at + * all. */ trace_xfs_buf_item_size_ordered(bip); *nvecs = XFS_LOG_VEC_ORDERED; diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index 17e20a6d8b4e..6ff91e5bf3cd 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -28,6 +28,20 @@ static inline struct xfs_inode_log_item *INODE_ITEM(struct xfs_log_item *lip) return container_of(lip, struct xfs_inode_log_item, ili_item); }
+/* + * The logged size of an inode fork is always the current size of the inode + * fork. This means that when an inode fork is relogged, the size of the logged + * region is determined by the current state, not the combination of the + * previously logged state + the current state. This is different relogging + * behaviour to most other log items which will retain the size of the + * previously logged changes when smaller regions are relogged. + * + * Hence operations that remove data from the inode fork (e.g. shortform + * dir/attr remove, extent form extent removal, etc), the size of the relogged + * inode gets -smaller- rather than stays the same size as the previously logged + * size and this can result in the committing transaction reducing the amount of + * space being consumed by the CIL. + */ STATIC void xfs_inode_item_data_fork_size( struct xfs_inode_log_item *iip, diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c index 9d2fa8464289..903617e6d054 100644 --- a/fs/xfs/xfs_log_cil.c +++ b/fs/xfs/xfs_log_cil.c @@ -670,9 +670,14 @@ xlog_cil_push_work( ASSERT(push_seq <= ctx->sequence);
/* - * Wake up any background push waiters now this context is being pushed. + * As we are about to switch to a new, empty CIL context, we no longer + * need to throttle tasks on CIL space overruns. Wake any waiters that + * the hard push throttle may have caught so they can start committing + * to the new context. The ctx->xc_push_lock provides the serialisation + * necessary for safely using the lockless waitqueue_active() check in + * this context. */ - if (ctx->space_used >= XLOG_CIL_BLOCKING_SPACE_LIMIT(log)) + if (waitqueue_active(&cil->xc_push_wait)) wake_up_all(&cil->xc_push_wait);
/* @@ -944,7 +949,7 @@ xlog_cil_push_background( ASSERT(!list_empty(&cil->xc_cil));
/* - * don't do a background push if we haven't used up all the + * Don't do a background push if we haven't used up all the * space available yet. */ if (cil->xc_ctx->space_used < XLOG_CIL_SPACE_LIMIT(log)) { @@ -968,9 +973,16 @@ xlog_cil_push_background(
/* * If we are well over the space limit, throttle the work that is being - * done until the push work on this context has begun. + * done until the push work on this context has begun. Enforce the hard + * throttle on all transaction commits once it has been activated, even + * if the committing transactions have resulted in the space usage + * dipping back down under the hard limit. + * + * The ctx->xc_push_lock provides the serialisation necessary for safely + * using the lockless waitqueue_active() check in this context. */ - if (cil->xc_ctx->space_used >= XLOG_CIL_BLOCKING_SPACE_LIMIT(log)) { + if (cil->xc_ctx->space_used >= XLOG_CIL_BLOCKING_SPACE_LIMIT(log) || + waitqueue_active(&cil->xc_push_wait)) { trace_xfs_log_cil_wait(log, cil->xc_ctx->ticket); ASSERT(cil->xc_ctx->space_used < log->l_logsize); xlog_wait(&cil->xc_push_wait, &cil->xc_push_lock);
From: Dave Chinner dchinner@redhat.com
mainline-inclusion from mainline-v5.13-rc4 commit 5f9b4b0de8dc2fb8eb655463b438001c111570fe category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4KIAO CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
-------------------------------------------------
In doing an investigation into AIL push stalls, I was looking at the log force code to see if an async CIL push could be done instead. This lead me to xfs_log_force_lsn() and looking at how it works.
xfs_log_force_lsn() is only called from inode synchronisation contexts such as fsync(), and it takes the ip->i_itemp->ili_last_lsn value as the LSN to sync the log to. This gets passed to xlog_cil_force_lsn() via xfs_log_force_lsn() to flush the CIL to the journal, and then used by xfs_log_force_lsn() to flush the iclogs to the journal.
The problem is that ip->i_itemp->ili_last_lsn does not store a log sequence number. What it stores is passed to it from the ->iop_committing method, which is called by xfs_log_commit_cil(). The value this passes to the iop_committing method is the CIL context sequence number that the item was committed to.
As it turns out, xlog_cil_force_lsn() converts the sequence to an actual commit LSN for the related context and returns that to xfs_log_force_lsn(). xfs_log_force_lsn() overwrites it's "lsn" variable that contained a sequence with an actual LSN and then uses that to sync the iclogs.
This caused me some confusion for a while, even though I originally wrote all this code a decade ago. ->iop_committing is only used by a couple of log item types, and only inode items use the sequence number it is passed.
Let's clean up the API, CIL structures and inode log item to call it a sequence number, and make it clear that the high level code is using CIL sequence numbers and not on-disk LSNs for integrity synchronisation purposes.
Signed-off-by: Dave Chinner dchinner@redhat.com Reviewed-by: Brian Foster bfoster@redhat.com Reviewed-by: Darrick J. Wong djwong@kernel.org Reviewed-by: Allison Henderson allison.henderson@oracle.com Signed-off-by: Darrick J. Wong djwong@kernel.org Signed-off-by: Guo Xuenan guoxuenan@huawei.com Reviewed-by: Lihong Kou koulihong@huawei.com Reviewed-by: Zhang Yi yi.zhang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- fs/xfs/libxfs/xfs_types.h | 1 + fs/xfs/xfs_buf_item.c | 2 +- fs/xfs/xfs_dquot_item.c | 2 +- fs/xfs/xfs_file.c | 10 +++++----- fs/xfs/xfs_inode.c | 10 +++++----- fs/xfs/xfs_inode_item.c | 4 ++-- fs/xfs/xfs_inode_item.h | 2 +- fs/xfs/xfs_log.c | 27 ++++++++++++++------------- fs/xfs/xfs_log.h | 4 +--- fs/xfs/xfs_log_cil.c | 30 +++++++++++------------------- fs/xfs/xfs_log_priv.h | 15 +++++++-------- fs/xfs/xfs_trans.c | 6 +++--- fs/xfs/xfs_trans.h | 4 ++-- 13 files changed, 54 insertions(+), 63 deletions(-)
diff --git a/fs/xfs/libxfs/xfs_types.h b/fs/xfs/libxfs/xfs_types.h index 397d94775440..1ce06173c2f5 100644 --- a/fs/xfs/libxfs/xfs_types.h +++ b/fs/xfs/libxfs/xfs_types.h @@ -21,6 +21,7 @@ typedef int32_t xfs_suminfo_t; /* type of bitmap summary info */ typedef uint32_t xfs_rtword_t; /* word type for bitmap manipulations */
typedef int64_t xfs_lsn_t; /* log sequence number */ +typedef int64_t xfs_csn_t; /* CIL sequence number */
typedef uint32_t xfs_dablk_t; /* dir/attr block number (in file) */ typedef uint32_t xfs_dahash_t; /* dir/attr hash value */ diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index 8c6e26d62ef2..5d6535370f87 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -632,7 +632,7 @@ xfs_buf_item_release( STATIC void xfs_buf_item_committing( struct xfs_log_item *lip, - xfs_lsn_t commit_lsn) + xfs_csn_t seq) { return xfs_buf_item_release(lip); } diff --git a/fs/xfs/xfs_dquot_item.c b/fs/xfs/xfs_dquot_item.c index 8c1fdf37ee8f..8ed47b739b6c 100644 --- a/fs/xfs/xfs_dquot_item.c +++ b/fs/xfs/xfs_dquot_item.c @@ -188,7 +188,7 @@ xfs_qm_dquot_logitem_release( STATIC void xfs_qm_dquot_logitem_committing( struct xfs_log_item *lip, - xfs_lsn_t commit_lsn) + xfs_csn_t seq) { return xfs_qm_dquot_logitem_release(lip); } diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 5fedb04b0cc3..a66afc7ddfdd 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -131,7 +131,7 @@ xfs_file_fsync( struct xfs_mount *mp = ip->i_mount; int error = 0; int log_flushed = 0; - xfs_lsn_t lsn = 0; + xfs_csn_t seq = 0;
trace_xfs_file_fsync(ip);
@@ -163,7 +163,7 @@ xfs_file_fsync( * that we don't get a racing sync operation that does not wait for the * metadata to hit the journal before returning. If we race with * clearing the ili_fsync_fields, then all that will happen is the log - * force will do nothing as the lsn will already be on disk. We can't + * force will do nothing as the seq will already be on disk. We can't * race with setting ili_fsync_fields because that is done under * XFS_ILOCK_EXCL, and that can't happen because we hold the lock shared * until after the ili_fsync_fields is cleared. @@ -172,11 +172,11 @@ xfs_file_fsync( if (xfs_ipincount(ip)) { if (!datasync || (iip->ili_fsync_fields & ~XFS_ILOG_TIMESTAMP)) - lsn = iip->ili_last_lsn; + seq = iip->ili_commit_seq; }
- if (lsn) { - error = xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, &log_flushed); + if (seq) { + error = xfs_log_force_seq(mp, seq, XFS_LOG_SYNC, &log_flushed); spin_lock(&iip->ili_lock); iip->ili_fsync_fields = 0; spin_unlock(&iip->ili_lock); diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index b4557a827666..4db3c0b7d71b 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -2754,7 +2754,7 @@ xfs_iunpin( trace_xfs_inode_unpin_nowait(ip, _RET_IP_);
/* Give the log a push to start the unpinning I/O */ - xfs_log_force_lsn(ip->i_mount, ip->i_itemp->ili_last_lsn, 0, NULL); + xfs_log_force_seq(ip->i_mount, ip->i_itemp->ili_commit_seq, 0, NULL);
}
@@ -3709,16 +3709,16 @@ int xfs_log_force_inode( struct xfs_inode *ip) { - xfs_lsn_t lsn = 0; + xfs_csn_t seq = 0;
xfs_ilock(ip, XFS_ILOCK_SHARED); if (xfs_ipincount(ip)) - lsn = ip->i_itemp->ili_last_lsn; + seq = ip->i_itemp->ili_commit_seq; xfs_iunlock(ip, XFS_ILOCK_SHARED);
- if (!lsn) + if (!seq) return 0; - return xfs_log_force_lsn(ip->i_mount, lsn, XFS_LOG_SYNC, NULL); + return xfs_log_force_seq(ip->i_mount, seq, XFS_LOG_SYNC, NULL); }
/* diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index 6ff91e5bf3cd..3aba4559469f 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -617,9 +617,9 @@ xfs_inode_item_committed( STATIC void xfs_inode_item_committing( struct xfs_log_item *lip, - xfs_lsn_t commit_lsn) + xfs_csn_t seq) { - INODE_ITEM(lip)->ili_last_lsn = commit_lsn; + INODE_ITEM(lip)->ili_commit_seq = seq; return xfs_inode_item_release(lip); }
diff --git a/fs/xfs/xfs_inode_item.h b/fs/xfs/xfs_inode_item.h index 4b926e32831c..403b45ab9aa2 100644 --- a/fs/xfs/xfs_inode_item.h +++ b/fs/xfs/xfs_inode_item.h @@ -33,7 +33,7 @@ struct xfs_inode_log_item { unsigned int ili_fields; /* fields to be logged */ unsigned int ili_fsync_fields; /* logged since last fsync */ xfs_lsn_t ili_flush_lsn; /* lsn at last flush */ - xfs_lsn_t ili_last_lsn; /* lsn at last transaction */ + xfs_csn_t ili_commit_seq; /* last transaction commit */ };
static inline int xfs_inode_clean(struct xfs_inode *ip) diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 001307f40f99..befd46346786 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -3175,14 +3175,13 @@ xfs_log_force( }
static int -__xfs_log_force_lsn( - struct xfs_mount *mp, +xlog_force_lsn( + struct xlog *log, xfs_lsn_t lsn, uint flags, int *log_flushed, bool already_slept) { - struct xlog *log = mp->m_log; struct xlog_in_core *iclog;
spin_lock(&log->l_icloglock); @@ -3215,8 +3214,6 @@ __xfs_log_force_lsn( if (!already_slept && (iclog->ic_prev->ic_state == XLOG_STATE_WANT_SYNC || iclog->ic_prev->ic_state == XLOG_STATE_SYNCING)) { - XFS_STATS_INC(mp, xs_log_force_sleep); - xlog_wait(&iclog->ic_prev->ic_write_wait, &log->l_icloglock); return -EAGAIN; @@ -3254,25 +3251,29 @@ __xfs_log_force_lsn( * to disk, that thread will wake up all threads waiting on the queue. */ int -xfs_log_force_lsn( +xfs_log_force_seq( struct xfs_mount *mp, - xfs_lsn_t lsn, + xfs_csn_t seq, uint flags, int *log_flushed) { + struct xlog *log = mp->m_log; + xfs_lsn_t lsn; int ret; - ASSERT(lsn != 0); + ASSERT(seq != 0);
XFS_STATS_INC(mp, xs_log_force); - trace_xfs_log_force(mp, lsn, _RET_IP_); + trace_xfs_log_force(mp, seq, _RET_IP_);
- lsn = xlog_cil_force_lsn(mp->m_log, lsn); + lsn = xlog_cil_force_seq(log, seq); if (lsn == NULLCOMMITLSN) return 0;
- ret = __xfs_log_force_lsn(mp, lsn, flags, log_flushed, false); - if (ret == -EAGAIN) - ret = __xfs_log_force_lsn(mp, lsn, flags, log_flushed, true); + ret = xlog_force_lsn(log, lsn, flags, log_flushed, false); + if (ret == -EAGAIN) { + XFS_STATS_INC(mp, xs_log_force_sleep); + ret = xlog_force_lsn(log, lsn, flags, log_flushed, true); + } return ret; }
diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h index 7c8a775781d3..93fbb8fbbc8c 100644 --- a/fs/xfs/xfs_log.h +++ b/fs/xfs/xfs_log.h @@ -106,7 +106,7 @@ struct xfs_item_ops; struct xfs_trans;
int xfs_log_force(struct xfs_mount *mp, uint flags); -int xfs_log_force_lsn(struct xfs_mount *mp, xfs_lsn_t lsn, uint flags, +int xfs_log_force_seq(struct xfs_mount *mp, xfs_csn_t seq, uint flags, int *log_forced); int xfs_log_mount(struct xfs_mount *mp, struct xfs_buftarg *log_target, @@ -130,8 +130,6 @@ int xfs_log_force_umount(struct xfs_mount *mp, int logerror); struct xlog_ticket *xfs_log_ticket_get(struct xlog_ticket *ticket); void xfs_log_ticket_put(struct xlog_ticket *ticket);
-void xfs_log_commit_cil(struct xfs_mount *mp, struct xfs_trans *tp, - xfs_lsn_t *commit_lsn, bool regrant); void xlog_cil_process_committed(struct list_head *list); bool xfs_log_item_in_current_chkpt(struct xfs_log_item *lip);
diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c index 903617e6d054..3c2b1205944d 100644 --- a/fs/xfs/xfs_log_cil.c +++ b/fs/xfs/xfs_log_cil.c @@ -788,7 +788,7 @@ xlog_cil_push_work( * that higher sequences will wait for us to write out a commit record * before they do. * - * xfs_log_force_lsn requires us to mirror the new sequence into the cil + * xfs_log_force_seq requires us to mirror the new sequence into the cil * structure atomically with the addition of this sequence to the * committing list. This also ensures that we can do unlocked checks * against the current sequence in log forces without risking @@ -1057,16 +1057,14 @@ xlog_cil_empty( * allowed again. */ void -xfs_log_commit_cil( - struct xfs_mount *mp, +xlog_cil_commit( + struct xlog *log, struct xfs_trans *tp, - xfs_lsn_t *commit_lsn, + xfs_csn_t *commit_seq, bool regrant) { - struct xlog *log = mp->m_log; struct xfs_cil *cil = log->l_cilp; struct xfs_log_item *lip, *next; - xfs_lsn_t xc_commit_lsn;
/* * Do all necessary memory allocation before we lock the CIL. @@ -1080,10 +1078,6 @@ xfs_log_commit_cil(
xlog_cil_insert_items(log, tp);
- xc_commit_lsn = cil->xc_ctx->sequence; - if (commit_lsn) - *commit_lsn = xc_commit_lsn; - if (regrant && !XLOG_FORCED_SHUTDOWN(log)) xfs_log_ticket_regrant(log, tp->t_ticket); else @@ -1106,8 +1100,10 @@ xfs_log_commit_cil( list_for_each_entry_safe(lip, next, &tp->t_items, li_trans) { xfs_trans_del_item(lip); if (lip->li_ops->iop_committing) - lip->li_ops->iop_committing(lip, xc_commit_lsn); + lip->li_ops->iop_committing(lip, cil->xc_ctx->sequence); } + if (commit_seq) + *commit_seq = cil->xc_ctx->sequence;
/* xlog_cil_push_background() releases cil->xc_ctx_lock */ xlog_cil_push_background(log); @@ -1124,9 +1120,9 @@ xfs_log_commit_cil( * iclog flush is necessary following this call. */ xfs_lsn_t -xlog_cil_force_lsn( +xlog_cil_force_seq( struct xlog *log, - xfs_lsn_t sequence) + xfs_csn_t sequence) { struct xfs_cil *cil = log->l_cilp; struct xfs_cil_ctx *ctx; @@ -1222,21 +1218,17 @@ bool xfs_log_item_in_current_chkpt( struct xfs_log_item *lip) { - struct xfs_cil_ctx *ctx; + struct xfs_cil_ctx *ctx = lip->li_mountp->m_log->l_cilp->xc_ctx;
if (list_empty(&lip->li_cil)) return false;
- ctx = lip->li_mountp->m_log->l_cilp->xc_ctx; - /* * li_seq is written on the first commit of a log item to record the * first checkpoint it is written to. Hence if it is different to the * current sequence, we're in a new checkpoint. */ - if (XFS_LSN_CMP(lip->li_seq, ctx->sequence) != 0) - return false; - return true; + return lip->li_seq == ctx->sequence; }
/* diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h index 2203ccecafb6..2d7e7cbee8b7 100644 --- a/fs/xfs/xfs_log_priv.h +++ b/fs/xfs/xfs_log_priv.h @@ -234,7 +234,7 @@ struct xfs_cil;
struct xfs_cil_ctx { struct xfs_cil *cil; - xfs_lsn_t sequence; /* chkpt sequence # */ + xfs_csn_t sequence; /* chkpt sequence # */ xfs_lsn_t start_lsn; /* first LSN of chkpt commit */ xfs_lsn_t commit_lsn; /* chkpt commit record lsn */ struct xlog_ticket *ticket; /* chkpt ticket */ @@ -272,10 +272,10 @@ struct xfs_cil { struct xfs_cil_ctx *xc_ctx;
spinlock_t xc_push_lock ____cacheline_aligned_in_smp; - xfs_lsn_t xc_push_seq; + xfs_csn_t xc_push_seq; struct list_head xc_committing; wait_queue_head_t xc_commit_wait; - xfs_lsn_t xc_current_sequence; + xfs_csn_t xc_current_sequence; struct work_struct xc_push_work; wait_queue_head_t xc_push_wait; /* background push throttle */ } ____cacheline_aligned_in_smp; @@ -554,19 +554,18 @@ int xlog_cil_init(struct xlog *log); void xlog_cil_init_post_recovery(struct xlog *log); void xlog_cil_destroy(struct xlog *log); bool xlog_cil_empty(struct xlog *log); +void xlog_cil_commit(struct xlog *log, struct xfs_trans *tp, + xfs_csn_t *commit_seq, bool regrant);
/* * CIL force routines */ -xfs_lsn_t -xlog_cil_force_lsn( - struct xlog *log, - xfs_lsn_t sequence); +xfs_lsn_t xlog_cil_force_seq(struct xlog *log, xfs_csn_t sequence);
static inline void xlog_cil_force(struct xlog *log) { - xlog_cil_force_lsn(log, log->l_cilp->xc_current_sequence); + xlog_cil_force_seq(log, log->l_cilp->xc_current_sequence); }
/* diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index f41b16ad5b24..c19807abbefa 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -830,7 +830,7 @@ __xfs_trans_commit( bool regrant) { struct xfs_mount *mp = tp->t_mountp; - xfs_lsn_t commit_lsn = -1; + xfs_csn_t commit_seq = 0; int error = 0; int sync = tp->t_flags & XFS_TRANS_SYNC;
@@ -872,7 +872,7 @@ __xfs_trans_commit( xfs_trans_apply_sb_deltas(tp); xfs_trans_apply_dquot_deltas(tp);
- xfs_log_commit_cil(mp, tp, &commit_lsn, regrant); + xlog_cil_commit(mp->m_log, tp, &commit_seq, regrant);
current_restore_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS); xfs_trans_free(tp); @@ -882,7 +882,7 @@ __xfs_trans_commit( * log out now and wait for it. */ if (sync) { - error = xfs_log_force_lsn(mp, commit_lsn, XFS_LOG_SYNC, NULL); + error = xfs_log_force_seq(mp, commit_seq, XFS_LOG_SYNC, NULL); XFS_STATS_INC(mp, xs_trans_sync); } else { XFS_STATS_INC(mp, xs_trans_async); diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h index 2f802d6c9fcf..f4159326411a 100644 --- a/fs/xfs/xfs_trans.h +++ b/fs/xfs/xfs_trans.h @@ -43,7 +43,7 @@ struct xfs_log_item { struct list_head li_cil; /* CIL pointers */ struct xfs_log_vec *li_lv; /* active log vector */ struct xfs_log_vec *li_lv_shadow; /* standby vector */ - xfs_lsn_t li_seq; /* CIL commit seq */ + xfs_csn_t li_seq; /* CIL commit seq */ };
/* @@ -69,7 +69,7 @@ struct xfs_item_ops { void (*iop_pin)(struct xfs_log_item *); void (*iop_unpin)(struct xfs_log_item *, int remove); uint (*iop_push)(struct xfs_log_item *, struct list_head *); - void (*iop_committing)(struct xfs_log_item *, xfs_lsn_t commit_lsn); + void (*iop_committing)(struct xfs_log_item *lip, xfs_csn_t seq); void (*iop_release)(struct xfs_log_item *); xfs_lsn_t (*iop_committed)(struct xfs_log_item *, xfs_lsn_t); int (*iop_recover)(struct xfs_log_item *lip,
From: "Darrick J. Wong" djwong@kernel.org
mainline-inclusion from mainline-v5.13-rc4 commit 81ed94751b1513fcc5978dcc06eb1f5b4e55a785 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4KIAO CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
-------------------------------------------------
During regular operation, the xfs_inactive operations create transactions with zero block reservation because in general we're freeing space, not asking for more. The per-AG space reservations created at mount time enable us to handle expansions of the refcount btree without needing to reserve blocks to the transaction.
Unfortunately, log recovery doesn't create the per-AG space reservations when intent items are being recovered. This isn't an issue for intent item recovery itself because they explicitly request blocks, but any inode inactivation that can happen during log recovery uses the same xfs_inactive paths as regular runtime. If a refcount btree expansion happens, the transaction will fail due to blk_res_used > blk_res, and we shut down the filesystem unnecessarily.
Fix this problem by making per-AG reservations temporarily so that we can handle the inactivations, and releasing them at the end. This brings the recovery environment closer to the runtime environment.
Signed-off-by: Darrick J. Wong djwong@kernel.org Reviewed-by: Christoph Hellwig hch@lst.de Signed-off-by: Guo Xuenan guoxuenan@huawei.com Reviewed-by: Lihong Kou koulihong@huawei.com Reviewed-by: Zhang Yi yi.zhang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- fs/xfs/xfs_mount.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-)
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index 7110507a2b6b..ed488852188f 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -927,9 +927,17 @@ xfs_mountfs( /* * Finish recovering the file system. This part needed to be delayed * until after the root and real-time bitmap inodes were consistently - * read in. + * read in. Temporarily create per-AG space reservations for metadata + * btree shape changes because space freeing transactions (for inode + * inactivation) require the per-AG reservation in lieu of reserving + * blocks. */ + error = xfs_fs_reserve_ag_blocks(mp); + if (error && error == -ENOSPC) + xfs_warn(mp, + "ENOSPC reserving per-AG metadata pool, log recovery may fail."); error = xfs_log_mount_finish(mp); + xfs_fs_unreserve_ag_blocks(mp); if (error) { xfs_warn(mp, "log mount finish failed"); goto out_rtunmount;
From: "Darrick J. Wong" djwong@kernel.org
mainline-inclusion from mainline-v5.13-rc4 commit 4e6b8270c820c8c57a73f869799a0af2b56eff3e category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4KIAO CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
-------------------------------------------------
If any part of log intent item recovery fails, we should shut down the log immediately to stop the log from writing a clean unmount record to disk, because the metadata is not consistent. The inability to cancel a dirty transaction catches most of these cases, but there are a few things that have slipped through the cracks, such as ENOSPC from a transaction allocation, or runtime errors that result in cancellation of a non-dirty transaction.
This solves some weird behaviors reported by customers where a system goes down, the first mount fails, the second succeeds, but then the fs goes down later because of inconsistent metadata.
Signed-off-by: Darrick J. Wong djwong@kernel.org Reviewed-by: Christoph Hellwig hch@lst.de Signed-off-by: Guo Xuenan guoxuenan@huawei.com Reviewed-by: Lihong Kou koulihong@huawei.com Reviewed-by: Zhang Yi yi.zhang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- fs/xfs/xfs_log.c | 3 +++ fs/xfs/xfs_log_recover.c | 5 ++++- 2 files changed, 7 insertions(+), 1 deletion(-)
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index befd46346786..f94da2c8d5d2 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -729,6 +729,9 @@ xfs_log_mount_finish( if (readonly) mp->m_flags |= XFS_MOUNT_RDONLY;
+ /* Make sure the log is dead if we're returning failure. */ + ASSERT(!error || (mp->m_log->l_flags & XLOG_IO_ERROR)); + return error; }
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 87886b7f77da..69408782019e 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -2457,8 +2457,10 @@ xlog_finish_defer_ops(
error = xfs_trans_alloc(mp, &resv, dfc->dfc_blkres, dfc->dfc_rtxres, XFS_TRANS_RESERVE, &tp); - if (error) + if (error) { + xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR); return error; + }
/* * Transfer to this new transaction all the dfops we captured @@ -3454,6 +3456,7 @@ xlog_recover_finish( * this) before we get around to xfs_log_mount_cancel. */ xlog_recover_cancel_intents(log); + xfs_force_shutdown(log->l_mp, SHUTDOWN_LOG_IO_ERROR); xfs_alert(log->l_mp, "Failed to recover intents"); return error; }
From: Dave Chinner dchinner@redhat.com
mainline-inclusion from mainline-v5.13-rc4 commit 6be001021f0b307c8c1544e8b3ac87de20d711de category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4KIAO CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
-------------------------------------------------
It's completely unnecessary because callbacks are added to iclogs without holding the icloglock, hence no amount of ordering between the icloglock and ic_callback_lock will order the removal of callbacks from the iclog.
Signed-off-by: Dave Chinner dchinner@redhat.com Reviewed-by: Brian Foster bfoster@redhat.com Reviewed-by: Darrick J. Wong djwong@kernel.org Signed-off-by: Darrick J. Wong djwong@kernel.org Signed-off-by: Guo Xuenan guoxuenan@huawei.com Reviewed-by: Lihong Kou koulihong@huawei.com Reviewed-by: Zhang Yi yi.zhang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- fs/xfs/xfs_log.c | 19 +++++-------------- 1 file changed, 5 insertions(+), 14 deletions(-)
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index f94da2c8d5d2..26ace6c12e3a 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -2687,11 +2687,9 @@ static void xlog_state_do_iclog_callbacks( struct xlog *log, struct xlog_in_core *iclog) - __releases(&log->l_icloglock) - __acquires(&log->l_icloglock) { - spin_unlock(&log->l_icloglock); spin_lock(&iclog->ic_callback_lock); + while (!list_empty(&iclog->ic_callbacks)) { LIST_HEAD(tmp);
@@ -2702,12 +2700,6 @@ xlog_state_do_iclog_callbacks( spin_lock(&iclog->ic_callback_lock); }
- /* - * Pick up the icloglock while still holding the callback lock so we - * serialise against anyone trying to add more callbacks to this iclog - * now we've finished processing. - */ - spin_lock(&log->l_icloglock); spin_unlock(&iclog->ic_callback_lock); }
@@ -2748,13 +2740,12 @@ xlog_state_do_callback( iclog = iclog->ic_next; continue; } + spin_unlock(&log->l_icloglock);
- /* - * Running callbacks will drop the icloglock which means - * we'll have to run at least one more complete loop. - */ - cycled_icloglock = true; xlog_state_do_iclog_callbacks(log, iclog); + cycled_icloglock = true; + + spin_lock(&log->l_icloglock); if (XLOG_FORCED_SHUTDOWN(log)) wake_up_all(&iclog->ic_force_wait); else
From: Dave Chinner dchinner@redhat.com
mainline-inclusion from mainline-v5.13-rc4 commit b6903358c230c517b29ecdb6123276d96cc0beab category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4KIAO CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
-------------------------------------------------
If we are processing callbacks on an iclog, nothing can be concurrently adding callbacks to the loop. We only add callbacks to the iclog when they are in ACTIVE or WANT_SYNC state, and we explicitly do not add callbacks if the iclog is already in IOERROR state.
The only way to have a dequeue racing with an enqueue is to be processing a shutdown without a direct reference to an iclog in ACTIVE or WANT_SYNC state. As the enqueue avoids this race condition, we only ever need a single dequeue operation in xlog_state_do_iclog_callbacks(). Hence we can remove the loop.
Signed-off-by: Dave Chinner dchinner@redhat.com Reviewed-by: Brian Foster bfoster@redhat.com Reviewed-by: Darrick J. Wong djwong@kernel.org Signed-off-by: Darrick J. Wong djwong@kernel.org Signed-off-by: Guo Xuenan guoxuenan@huawei.com Reviewed-by: Lihong Kou koulihong@huawei.com Reviewed-by: Zhang Yi yi.zhang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- fs/xfs/xfs_log.c | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-)
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 26ace6c12e3a..a68fdae831af 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -2688,19 +2688,13 @@ xlog_state_do_iclog_callbacks( struct xlog *log, struct xlog_in_core *iclog) { - spin_lock(&iclog->ic_callback_lock); - - while (!list_empty(&iclog->ic_callbacks)) { - LIST_HEAD(tmp);
- list_splice_init(&iclog->ic_callbacks, &tmp); - - spin_unlock(&iclog->ic_callback_lock); - xlog_cil_process_committed(&tmp); - spin_lock(&iclog->ic_callback_lock); - } + LIST_HEAD(tmp);
+ spin_lock(&iclog->ic_callback_lock); + list_splice_init(&iclog->ic_callbacks, &tmp); spin_unlock(&iclog->ic_callback_lock); + xlog_cil_process_committed(&tmp); }
STATIC void
From: Dave Chinner dchinner@redhat.com
mainline-inclusion from mainline-v5.13-rc4 commit a1bb8505e92101df94080f81298e3640f5fbe037 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4KIAO CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
-------------------------------------------------
The iclog callback chain has it's own lock. That was added way back in 2008 by myself to alleviate severe lock contention on the icloglock in commit 114d23aae512 ("[XFS] Per iclog callback chain lock"). This was long before delayed logging took the icloglock out of the hot transaction commit path and removed all contention on it. Hence the separate ic_callback_lock doesn't serve any scalability purpose anymore, and hasn't for close on a decade.
Further, we only attach callbacks to iclogs in one place where we are already taking the icloglock soon after attaching the callbacks. We also have to drop the icloglock to run callbacks and grab it immediately afterwards again. So given that the icloglock is no longer hot, making it cover callbacks again doesn't really change the locking patterns very much at all.
We also need to extend the icloglock to cover callback addition to fix a zero-day UAF in the CIL push code. This occurs when shutdown races with xlog_cil_push_work() and the shutdown runs the callbacks before the push releases the iclog. This results in the CIL context structure attached to the iclog being freed by the callback before the CIL push has finished referencing it, leading to UAF bugs.
Hence, to avoid this UAF, we need the callback attachment to be atomic with post processing of the commit iclog and references to the structures being attached to the iclog. This requires holding the icloglock as that's the only way to serialise iclog state against a shutdown in progress.
The result is we need to be using the icloglock to protect the callback list addition and removal and serialise them with shutdown. That makes the ic_callback_lock redundant and so it can be removed.
Fixes: 71e330b59390 ("xfs: Introduce delayed logging core code") Signed-off-by: Dave Chinner dchinner@redhat.com Reviewed-by: Brian Foster bfoster@redhat.com Reviewed-by: Darrick J. Wong djwong@kernel.org Signed-off-by: Darrick J. Wong djwong@kernel.org Signed-off-by: Guo Xuenan guoxuenan@huawei.com Reviewed-by: Lihong Kou koulihong@huawei.com Reviewed-by: Zhang Yi yi.zhang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- fs/xfs/xfs_log.c | 29 ++++------------------------- fs/xfs/xfs_log_cil.c | 16 ++++++++++++---- fs/xfs/xfs_log_priv.h | 3 --- 3 files changed, 16 insertions(+), 32 deletions(-)
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index a68fdae831af..8479df31cb43 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -1403,7 +1403,6 @@ xlog_alloc_log( iclog->ic_state = XLOG_STATE_ACTIVE; iclog->ic_log = log; atomic_set(&iclog->ic_refcnt, 0); - spin_lock_init(&iclog->ic_callback_lock); INIT_LIST_HEAD(&iclog->ic_callbacks); iclog->ic_datap = (char *)iclog->ic_data + log->l_iclog_hsize;
@@ -2674,29 +2673,6 @@ xlog_state_iodone_process_iclog( } }
-/* - * Keep processing entries in the iclog callback list until we come around and - * it is empty. We need to atomically see that the list is empty and change the - * state to DIRTY so that we don't miss any more callbacks being added. - * - * This function is called with the icloglock held and returns with it held. We - * drop it while running callbacks, however, as holding it over thousands of - * callbacks is unnecessary and causes excessive contention if we do. - */ -static void -xlog_state_do_iclog_callbacks( - struct xlog *log, - struct xlog_in_core *iclog) -{ - - LIST_HEAD(tmp); - - spin_lock(&iclog->ic_callback_lock); - list_splice_init(&iclog->ic_callbacks, &tmp); - spin_unlock(&iclog->ic_callback_lock); - xlog_cil_process_committed(&tmp); -} - STATIC void xlog_state_do_callback( struct xlog *log) @@ -2725,6 +2701,8 @@ xlog_state_do_callback( repeats++;
do { + LIST_HEAD(cb_list); + if (xlog_state_iodone_process_iclog(log, iclog, &ioerror)) break; @@ -2734,9 +2712,10 @@ xlog_state_do_callback( iclog = iclog->ic_next; continue; } + list_splice_init(&iclog->ic_callbacks, &cb_list); spin_unlock(&log->l_icloglock);
- xlog_state_do_iclog_callbacks(log, iclog); + xlog_cil_process_committed(&cb_list); cycled_icloglock = true;
spin_lock(&log->l_icloglock); diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c index 3c2b1205944d..db03f6f7b5a4 100644 --- a/fs/xfs/xfs_log_cil.c +++ b/fs/xfs/xfs_log_cil.c @@ -873,15 +873,21 @@ xlog_cil_push_work(
xfs_log_ticket_ungrant(log, tic);
- spin_lock(&commit_iclog->ic_callback_lock); + /* + * Once we attach the ctx to the iclog, a shutdown can process the + * iclog, run the callbacks and free the ctx. The only thing preventing + * this potential UAF situation here is that we are holding the + * icloglock. Hence we cannot access the ctx after we have attached the + * callbacks and dropped the icloglock. + */ + spin_lock(&log->l_icloglock); if (commit_iclog->ic_state == XLOG_STATE_IOERROR) { - spin_unlock(&commit_iclog->ic_callback_lock); + spin_unlock(&log->l_icloglock); goto out_abort; } ASSERT_ALWAYS(commit_iclog->ic_state == XLOG_STATE_ACTIVE || commit_iclog->ic_state == XLOG_STATE_WANT_SYNC); list_add_tail(&ctx->iclog_entry, &commit_iclog->ic_callbacks); - spin_unlock(&commit_iclog->ic_callback_lock);
/* * now the checkpoint commit is complete and we've attached the @@ -898,8 +904,10 @@ xlog_cil_push_work( * iclogs to complete before we submit the commit_iclog. In this case, * the commit_iclog write needs to issue a pre-flush so that the * ordering is correctly preserved down to stable storage. + * + * NOTE: It is not safe to reference the ctx after this check as we drop + * the icloglock if we have to wait for completion of other iclogs. */ - spin_lock(&log->l_icloglock); if (ctx->start_lsn != commit_lsn) { xlog_wait_on_iclog(commit_iclog->ic_prev); spin_lock(&log->l_icloglock); diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h index 2d7e7cbee8b7..138441ab9690 100644 --- a/fs/xfs/xfs_log_priv.h +++ b/fs/xfs/xfs_log_priv.h @@ -206,9 +206,6 @@ typedef struct xlog_in_core { enum xlog_iclog_state ic_state; unsigned int ic_flags; char *ic_datap; /* pointer to iclog data */ - - /* Callback structures need their own cacheline */ - spinlock_t ic_callback_lock ____cacheline_aligned_in_smp; struct list_head ic_callbacks;
/* reference counts need their own cacheline */
From: Dave Chinner dchinner@redhat.com
mainline-inclusion from mainline-v5.13-rc4 commit 1effb72a8179a02c2dd8a268454ccf50bf68aa50 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4KIAO CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
-------------------------------------------------
The iclogbuf ring attached to the struct xlog is circular, hence the first and last iclogs in the ring can only be determined by comparing them against the log->l_iclog pointer.
In xfs_cil_push_work(), we want to wait on previous iclogs that were issued so that we can flush them to stable storage with the commit record write, and it simply waits on the previous iclog in the ring. This, however, leads to CIL push hangs in generic/019 like so:
task:kworker/u33:0 state:D stack:12680 pid: 7 ppid: 2 flags:0x00004000 Workqueue: xfs-cil/pmem1 xlog_cil_push_work Call Trace: __schedule+0x30b/0x9f0 schedule+0x68/0xe0 xlog_wait_on_iclog+0x121/0x190 ? wake_up_q+0xa0/0xa0 xlog_cil_push_work+0x994/0xa10 ? _raw_spin_lock+0x15/0x20 ? xfs_swap_extents+0x920/0x920 process_one_work+0x1ab/0x390 worker_thread+0x56/0x3d0 ? rescuer_thread+0x3c0/0x3c0 kthread+0x14d/0x170 ? __kthread_bind_mask+0x70/0x70 ret_from_fork+0x1f/0x30
With other threads blocking in either xlog_state_get_iclog_space() waiting for iclog space or xlog_grant_head_wait() waiting for log reservation space.
The problem here is that the previous iclog on the ring might actually be a future iclog. That is, if log->l_iclog points at commit_iclog, commit_iclog is the first (oldest) iclog in the ring and there are no previous iclogs pending as they have all completed their IO and been activated again. IOWs, commit_iclog->ic_prev points to an iclog that will be written in the future, not one that has been written in the past.
Hence, in this case, waiting on the ->ic_prev iclog is incorrect behaviour, and depending on the state of the future iclog, we can end up with a circular ABA wait cycle and we hang.
The fix is made more complex by the fact that many iclogs states cannot be used to determine if the iclog is a past or future iclog. Hence we have to determine past iclogs by checking the LSN of the iclog rather than their state. A past ACTIVE iclog will have a LSN of zero, while a future ACTIVE iclog will have a LSN greater than the current iclog. We don't wait on either of these cases.
Similarly, a future iclog that hasn't completed IO will have an LSN greater than the current iclog and so we don't wait on them. A past iclog that is still undergoing IO completion will have a LSN less than the current iclog and those are the only iclogs that we need to wait on.
Hence we can use the iclog LSN to determine what iclogs we need to wait on here.
Fixes: 5fd9256ce156 ("xfs: separate CIL commit record IO") Reported-by: Brian Foster bfoster@redhat.com Signed-off-by: Dave Chinner dchinner@redhat.com Reviewed-by: Brian Foster bfoster@redhat.com Reviewed-by: Darrick J. Wong djwong@kernel.org Signed-off-by: Darrick J. Wong djwong@kernel.org Signed-off-by: Guo Xuenan guoxuenan@huawei.com Reviewed-by: Lihong Kou koulihong@huawei.com Reviewed-by: Zhang Yi yi.zhang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- fs/xfs/xfs_log_cil.c | 35 ++++++++++++++++++++++++++++------- 1 file changed, 28 insertions(+), 7 deletions(-)
diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c index db03f6f7b5a4..b128aaa9b870 100644 --- a/fs/xfs/xfs_log_cil.c +++ b/fs/xfs/xfs_log_cil.c @@ -877,7 +877,7 @@ xlog_cil_push_work( * Once we attach the ctx to the iclog, a shutdown can process the * iclog, run the callbacks and free the ctx. The only thing preventing * this potential UAF situation here is that we are holding the - * icloglock. Hence we cannot access the ctx after we have attached the + * icloglock. Hence we cannot access the ctx once we have attached the * callbacks and dropped the icloglock. */ spin_lock(&log->l_icloglock); @@ -900,17 +900,38 @@ xlog_cil_push_work( spin_unlock(&cil->xc_push_lock);
/* - * If the checkpoint spans multiple iclogs, wait for all previous - * iclogs to complete before we submit the commit_iclog. In this case, - * the commit_iclog write needs to issue a pre-flush so that the - * ordering is correctly preserved down to stable storage. + * If the checkpoint spans multiple iclogs, wait for all previous iclogs + * to complete before we submit the commit_iclog. We can't use state + * checks for this - ACTIVE can be either a past completed iclog or a + * future iclog being filled, while WANT_SYNC through SYNC_DONE can be a + * past or future iclog awaiting IO or ordered IO completion to be run. + * In the latter case, if it's a future iclog and we wait on it, the we + * will hang because it won't get processed through to ic_force_wait + * wakeup until this commit_iclog is written to disk. Hence we use the + * iclog header lsn and compare it to the commit lsn to determine if we + * need to wait on iclogs or not. * * NOTE: It is not safe to reference the ctx after this check as we drop * the icloglock if we have to wait for completion of other iclogs. */ if (ctx->start_lsn != commit_lsn) { - xlog_wait_on_iclog(commit_iclog->ic_prev); - spin_lock(&log->l_icloglock); + xfs_lsn_t plsn; + + plsn = be64_to_cpu(commit_iclog->ic_prev->ic_header.h_lsn); + if (plsn && XFS_LSN_CMP(plsn, commit_lsn) < 0) { + /* + * Waiting on ic_force_wait orders the completion of + * iclogs older than ic_prev. Hence we only need to wait + * on the most recent older iclog here. + */ + xlog_wait_on_iclog(commit_iclog->ic_prev); + spin_lock(&log->l_icloglock); + } + + /* + * We need to issue a pre-flush so that the ordering for this + * checkpoint is correctly preserved down to stable storage. + */ commit_iclog->ic_flags |= XLOG_ICL_NEED_FLUSH; }
From: "Darrick J. Wong" djwong@kernel.org
mainline-inclusion from mainline-v5.14-rc1 commit 5838d0356bb3c320867c393f12b169c01a870bda category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4KIAO CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
-------------------------------------------------
While running xfs/168, I noticed a second source of post-shrink corruption errors causing shutdowns.
Let's say that directory B has a low inode number and is a child of directory A, which has a high number. If B is empty but open, and unlinked from A, B's dotdot link continues to point to A. If A is then unlinked and the filesystem shrunk so that A is no longer a valid inode, a subsequent AIL push of B will trip the inode verifiers because the dotdot entry points outside of the filesystem.
To avoid this problem, reset B's dotdot entry to the root directory when unlinking directories, since the root directory cannot be removed.
Signed-off-by: Darrick J. Wong djwong@kernel.org Reviewed-by: Gao Xiang hsiangkao@linux.alibaba.com Signed-off-by: Guo Xuenan guoxuenan@huawei.com Reviewed-by: Lihong Kou koulihong@huawei.com Reviewed-by: Zhang Yi yi.zhang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- fs/xfs/xfs_inode.c | 13 +++++++++++++ 1 file changed, 13 insertions(+)
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 4db3c0b7d71b..36c5d212bdde 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -2884,6 +2884,19 @@ xfs_remove( error = xfs_droplink(tp, ip); if (error) goto out_trans_cancel; + + /* + * Point the unlinked child directory's ".." entry to the root + * directory to eliminate back-references to inodes that may + * get freed before the child directory is closed. If the fs + * gets shrunk, this can lead to dirent inode validation errors. + */ + if (dp->i_ino != tp->t_mountp->m_sb.sb_rootino) { + error = xfs_dir_replace(tp, ip, &xfs_name_dotdot, + tp->t_mountp->m_sb.sb_rootino, 0); + if (error) + return error; + } } else { /* * When removing a non-directory we need to log the parent
From: "Darrick J. Wong" djwong@kernel.org
mainline-inclusion from mainline-v5.14-rc1 commit 0925fecc557471b6f6a488c3590a275151210572 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4KIAO CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
-------------------------------------------------
During a realtime grow operation, we run a single transaction for each rt bitmap block added to the filesystem. This means that each step has to be careful to increase sb_rblocks appropriately.
Fix the integer overflow error in this calculation that can happen when the extent size is very large. Found by running growfs to add a rt volume to a filesystem formatted with a 1g rt extent size.
Signed-off-by: Darrick J. Wong djwong@kernel.org Reviewed-by: Dave Chinner dchinner@redhat.com Reviewed-by: Christoph Hellwig hch@lst.de Signed-off-by: Guo Xuenan guoxuenan@huawei.com Reviewed-by: Lihong Kou koulihong@huawei.com Reviewed-by: Zhang Yi yi.zhang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- fs/xfs/xfs_rtalloc.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-)
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c index ede1baf31413..35aa62625bf3 100644 --- a/fs/xfs/xfs_rtalloc.c +++ b/fs/xfs/xfs_rtalloc.c @@ -991,7 +991,8 @@ xfs_growfs_rt( ((sbp->sb_rextents & ((1 << mp->m_blkbit_log) - 1)) != 0); bmbno < nrbmblocks; bmbno++) { - xfs_trans_t *tp; + struct xfs_trans *tp; + xfs_rfsblock_t nrblocks_step;
*nmp = *mp; nsbp = &nmp->m_sb; @@ -1000,10 +1001,9 @@ xfs_growfs_rt( */ nsbp->sb_rextsize = in->extsize; nsbp->sb_rbmblocks = bmbno + 1; - nsbp->sb_rblocks = - XFS_RTMIN(nrblocks, - nsbp->sb_rbmblocks * NBBY * - nsbp->sb_blocksize * nsbp->sb_rextsize); + nrblocks_step = (bmbno + 1) * NBBY * nsbp->sb_blocksize * + nsbp->sb_rextsize; + nsbp->sb_rblocks = min(nrblocks, nrblocks_step); nsbp->sb_rextents = nsbp->sb_rblocks; do_div(nsbp->sb_rextents, nsbp->sb_rextsize); ASSERT(nsbp->sb_rextents != 0);
From: Dave Chinner dchinner@redhat.com
mainline-inclusion from mainline-v5.14-rc1 commit b1e27239b9169f07edba0ca0e52805645a1768ba category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4KIAO CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
-------------------------------------------------
We incorrectly flush the log device instead of the data device when trying to ensure metadata is correctly on disk before writing the unmount record.
Fixes: eef983ffeae7 ("xfs: journal IO cache flush reductions") Signed-off-by: Dave Chinner dchinner@redhat.com Reviewed-by: Darrick J. Wong djwong@kernel.org Reviewed-by: Christoph Hellwig hch@lst.de Signed-off-by: Darrick J. Wong djwong@kernel.org Signed-off-by: Guo Xuenan guoxuenan@huawei.com Reviewed-by: Lihong Kou koulihong@huawei.com Reviewed-by: Zhang Yi yi.zhang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- fs/xfs/xfs_log.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 8479df31cb43..fed7a6c3d23e 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -806,7 +806,7 @@ xlog_write_unmount_record( * stamp the tail LSN into the unmount record. */ if (log->l_targ != log->l_mp->m_ddev_targp) - xfs_blkdev_issue_flush(log->l_targ); + xfs_blkdev_issue_flush(log->l_mp->m_ddev_targp); return xlog_write(log, &vec, ticket, NULL, NULL, XLOG_UNMOUNT_TRANS); }
From: Dave Chinner dchinner@redhat.com
mainline-inclusion from mainline-v5.14-rc1 commit b5d721eaae47eaa4b4c2754699dadacc4cbca2e0 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4KIAO CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
-------------------------------------------------
The recent journal flush/FUA changes replaced the flushing of the data device on every iclog write with an up-front async data device cache flush. Unfortunately, the assumption of which this was based on has been proven incorrect by the flush vs log tail update ordering issue. As the fix for that issue uses the XLOG_ICL_NEED_FLUSH flag to indicate that data device needs a cache flush, we now need to (once again) ensure that an iclog write to external logs that need a cache flush to be issued actually issue a cache flush to the data device as well as the log device.
Fixes: eef983ffeae7 ("xfs: journal IO cache flush reductions") Signed-off-by: Dave Chinner dchinner@redhat.com Reviewed-by: Christoph Hellwig hch@lst.de Reviewed-by: Darrick J. Wong djwong@kernel.org Signed-off-by: Darrick J. Wong djwong@kernel.org Signed-off-by: Guo Xuenan guoxuenan@huawei.com Reviewed-by: Lihong Kou koulihong@huawei.com Reviewed-by: Zhang Yi yi.zhang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- fs/xfs/xfs_log.c | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-)
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index fed7a6c3d23e..47ff347418a3 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -800,13 +800,6 @@ xlog_write_unmount_record( /* account for space used by record data */ ticket->t_curr_res -= sizeof(ulf);
- /* - * For external log devices, we need to flush the data device cache - * first to ensure all metadata writeback is on stable storage before we - * stamp the tail LSN into the unmount record. - */ - if (log->l_targ != log->l_mp->m_ddev_targp) - xfs_blkdev_issue_flush(log->l_mp->m_ddev_targp); return xlog_write(log, &vec, ticket, NULL, NULL, XLOG_UNMOUNT_TRANS); }
@@ -1713,10 +1706,20 @@ xlog_write_iclog( * metadata writeback and causing priority inversions. */ iclog->ic_bio.bi_opf = REQ_OP_WRITE | REQ_META | REQ_SYNC | REQ_IDLE; - if (iclog->ic_flags & XLOG_ICL_NEED_FLUSH) + if (iclog->ic_flags & XLOG_ICL_NEED_FLUSH) { iclog->ic_bio.bi_opf |= REQ_PREFLUSH; + /* + * For external log devices, we also need to flush the data + * device cache first to ensure all metadata writeback covered + * by the LSN in this iclog is on stable storage. This is slow, + * but it *must* complete before we issue the external log IO. + */ + if (log->l_targ != log->l_mp->m_ddev_targp) + xfs_blkdev_issue_flush(log->l_mp->m_ddev_targp); + } if (iclog->ic_flags & XLOG_ICL_NEED_FUA) iclog->ic_bio.bi_opf |= REQ_FUA; + iclog->ic_flags &= ~(XLOG_ICL_NEED_FLUSH | XLOG_ICL_NEED_FUA);
if (xlog_map_iclog_data(&iclog->ic_bio, iclog->ic_data, count)) {
From: Dave Chinner dchinner@redhat.com
mainline-inclusion from mainline-v5.14-rc1 commit 9d3920644081edf311878b56e0c1e1477991a195 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4KIAO CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
-------------------------------------------------
Fold __xlog_state_release_iclog into its only caller to prepare make an upcoming fix easier.
Signed-off-by: Dave Chinner dchinner@redhat.com [hch: split from a larger patch] Signed-off-by: Christoph Hellwig hch@lst.de Reviewed-by: Darrick J. Wong djwong@kernel.org Signed-off-by: Darrick J. Wong djwong@kernel.org Signed-off-by: Guo Xuenan guoxuenan@huawei.com Reviewed-by: Lihong Kou koulihong@huawei.com Reviewed-by: Zhang Yi yi.zhang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- fs/xfs/xfs_log.c | 43 ++++++++++++++++--------------------------- 1 file changed, 16 insertions(+), 27 deletions(-)
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 47ff347418a3..5df392387c29 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -463,28 +463,6 @@ xfs_log_reserve( return error; }
-static bool -__xlog_state_release_iclog( - struct xlog *log, - struct xlog_in_core *iclog) -{ - lockdep_assert_held(&log->l_icloglock); - - if (iclog->ic_state == XLOG_STATE_WANT_SYNC) { - /* update tail before writing to iclog */ - xfs_lsn_t tail_lsn = xlog_assign_tail_lsn(log->l_mp); - - iclog->ic_state = XLOG_STATE_SYNCING; - iclog->ic_header.h_tail_lsn = cpu_to_be64(tail_lsn); - xlog_verify_tail_lsn(log, iclog, tail_lsn); - /* cycle incremented when incrementing curr_block */ - return true; - } - - ASSERT(iclog->ic_state == XLOG_STATE_ACTIVE); - return false; -} - /* * Flush iclog to disk if this is the last reference to the given iclog and the * it is in the WANT_SYNC state. @@ -494,18 +472,29 @@ xlog_state_release_iclog( struct xlog *log, struct xlog_in_core *iclog) { + xfs_lsn_t tail_lsn; lockdep_assert_held(&log->l_icloglock);
if (iclog->ic_state == XLOG_STATE_IOERROR) return -EIO;
- if (atomic_dec_and_test(&iclog->ic_refcnt) && - __xlog_state_release_iclog(log, iclog)) { - spin_unlock(&log->l_icloglock); - xlog_sync(log, iclog); - spin_lock(&log->l_icloglock); + if (!atomic_dec_and_test(&iclog->ic_refcnt)) + return 0; + + if (iclog->ic_state != XLOG_STATE_WANT_SYNC) { + ASSERT(iclog->ic_state == XLOG_STATE_ACTIVE); + return 0; }
+ /* update tail before writing to iclog */ + tail_lsn = xlog_assign_tail_lsn(log->l_mp); + iclog->ic_state = XLOG_STATE_SYNCING; + iclog->ic_header.h_tail_lsn = cpu_to_be64(tail_lsn); + xlog_verify_tail_lsn(log, iclog, tail_lsn); + + spin_unlock(&log->l_icloglock); + xlog_sync(log, iclog); + spin_lock(&log->l_icloglock); return 0; }
From: Dave Chinner dchinner@redhat.com
mainline-inclusion from mainline-v5.14-rc1 commit 0dc8f7f139f07aaca1afcec0ade5718c4ebba91e category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4KIAO CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
-------------------------------------------------
There is a race between the new CIL async data device metadata IO completion cache flush and the log tail in the iclog the flush covers being updated. This can be seen by repeating generic/482 in a loop and eventually log recovery fails with a failures such as this:
XFS (dm-3): Starting recovery (logdev: internal) XFS (dm-3): bad inode magic/vsn daddr 228352 #0 (magic=0) XFS (dm-3): Metadata corruption detected at xfs_inode_buf_verify+0x180/0x190, xfs_inode block 0x37c00 xfs_inode_buf_verify XFS (dm-3): Unmount and run xfs_repair XFS (dm-3): First 128 bytes of corrupted metadata buffer: 00000000: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ 00000010: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ 00000020: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ 00000030: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ 00000040: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ 00000050: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ 00000060: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ 00000070: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ XFS (dm-3): metadata I/O error in "xlog_recover_items_pass2+0x55/0xc0" at daddr 0x37c00 len 32 error 117
Analysis of the logwrite replay shows that there were no writes to the data device between the FUA @ write 124 and the FUA at write @ 125, but log recovery @ 125 failed. The difference was the one log write @ 125 moved the tail of the log forwards from (1,8) to (1,32) and so the inode create intent in (1,8) was not replayed and so the inode cluster was zero on disk when replay of the first inode item in (1,32) was attempted.
What this meant was that the journal write that occurred at @ 125 did not ensure that metadata completed before the iclog was written was correctly on stable storage. The tail of the log moved forward, so IO must have been completed between the two iclog writes. This means that there is a race condition between the unconditional async cache flush in the CIL push work and the tail LSN that is written to the iclog. This happens like so:
CIL push work AIL push work ------------- ------------- Add to committing list start async data dev cache flush ..... <flush completes> <all writes to old tail lsn are stable> xlog_write .... push inode create buffer <start IO> ..... xlog_write(commit record) .... <IO completes> log tail moves xlog_assign_tail_lsn() start_lsn == commit_lsn <no iclog preflush!> xlog_state_release_iclog __xlog_state_release_iclog() <writes *new* tail_lsn into iclog> xlog_sync() .... submit_bio() <tail in log moves forward without flushing written metadata>
Essentially, this can only occur if the commit iclog is issued without a cache flush. If the iclog bio is submitted with REQ_PREFLUSH, then it will guarantee that all the completed IO is one stable storage before the iclog bio with the new tail LSN in it is written to the log.
IOWs, the tail lsn that is written to the iclog needs to be sampled *before* we issue the cache flush that guarantees all IO up to that LSN has been completed.
To fix this without giving up the performance advantage of the flush/FUA optimisations (e.g. g/482 runtime halves with 5.14-rc1 compared to 5.13), we need to ensure that we always issue a cache flush if the tail LSN changes between the initial async flush and the commit record being written. THis requires sampling the tail_lsn before we start the flush, and then passing the sampled tail LSN to xlog_state_release_iclog() so it can determine if the the tail LSN has changed while writing the checkpoint. If the tail LSN has changed, then it needs to set the NEED_FLUSH flag on the iclog and we'll issue another cache flush before writing the iclog.
Fixes: eef983ffeae7 ("xfs: journal IO cache flush reductions") Signed-off-by: Dave Chinner dchinner@redhat.com Reviewed-by: Christoph Hellwig hch@lst.de Reviewed-by: Darrick J. Wong djwong@kernel.org Signed-off-by: Darrick J. Wong djwong@kernel.org Signed-off-by: Guo Xuenan guoxuenan@huawei.com Reviewed-by: Lihong Kou koulihong@huawei.com Reviewed-by: Zhang Yi yi.zhang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- fs/xfs/xfs_log.c | 36 ++++++++++++++++++++++++++---------- fs/xfs/xfs_log_cil.c | 13 +++++++++++-- fs/xfs/xfs_log_priv.h | 3 ++- 3 files changed, 39 insertions(+), 13 deletions(-)
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 5df392387c29..625db909f96a 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -465,12 +465,17 @@ xfs_log_reserve(
/* * Flush iclog to disk if this is the last reference to the given iclog and the - * it is in the WANT_SYNC state. + * it is in the WANT_SYNC state. If the caller passes in a non-zero + * @old_tail_lsn and the current log tail does not match, there may be metadata + * on disk that must be persisted before this iclog is written. To satisfy that + * requirement, set the XLOG_ICL_NEED_FLUSH flag as a condition for writing this + * iclog with the new log tail value. */ int xlog_state_release_iclog( struct xlog *log, - struct xlog_in_core *iclog) + struct xlog_in_core *iclog, + xfs_lsn_t old_tail_lsn) { xfs_lsn_t tail_lsn; lockdep_assert_held(&log->l_icloglock); @@ -478,6 +483,19 @@ xlog_state_release_iclog( if (iclog->ic_state == XLOG_STATE_IOERROR) return -EIO;
+ /* + * Grabbing the current log tail needs to be atomic w.r.t. the writing + * of the tail LSN into the iclog so we guarantee that the log tail does + * not move between deciding if a cache flush is required and writing + * the LSN into the iclog below. + */ + if (old_tail_lsn || iclog->ic_state == XLOG_STATE_WANT_SYNC) { + tail_lsn = xlog_assign_tail_lsn(log->l_mp); + + if (old_tail_lsn && tail_lsn != old_tail_lsn) + iclog->ic_flags |= XLOG_ICL_NEED_FLUSH; + } + if (!atomic_dec_and_test(&iclog->ic_refcnt)) return 0;
@@ -486,8 +504,6 @@ xlog_state_release_iclog( return 0; }
- /* update tail before writing to iclog */ - tail_lsn = xlog_assign_tail_lsn(log->l_mp); iclog->ic_state = XLOG_STATE_SYNCING; iclog->ic_header.h_tail_lsn = cpu_to_be64(tail_lsn); xlog_verify_tail_lsn(log, iclog, tail_lsn); @@ -831,7 +847,7 @@ xlog_unmount_write( * iclog containing the unmount record is written. */ iclog->ic_flags |= (XLOG_ICL_NEED_FLUSH | XLOG_ICL_NEED_FUA); - error = xlog_state_release_iclog(log, iclog); + error = xlog_state_release_iclog(log, iclog, 0); xlog_wait_on_iclog(iclog);
if (tic) { @@ -2219,7 +2235,7 @@ xlog_write_copy_finish( return 0;
release_iclog: - error = xlog_state_release_iclog(log, iclog); + error = xlog_state_release_iclog(log, iclog, 0); spin_unlock(&log->l_icloglock); return error; } @@ -2438,7 +2454,7 @@ xlog_write( ASSERT(optype & XLOG_COMMIT_TRANS); *commit_iclog = iclog; } else { - error = xlog_state_release_iclog(log, iclog); + error = xlog_state_release_iclog(log, iclog, 0); } spin_unlock(&log->l_icloglock);
@@ -2867,7 +2883,7 @@ xlog_state_get_iclog_space( * reference to the iclog. */ if (!atomic_add_unless(&iclog->ic_refcnt, -1, 1)) - error = xlog_state_release_iclog(log, iclog); + error = xlog_state_release_iclog(log, iclog, 0); spin_unlock(&log->l_icloglock); if (error) return error; @@ -3100,7 +3116,7 @@ xfs_log_force( atomic_inc(&iclog->ic_refcnt); lsn = be64_to_cpu(iclog->ic_header.h_lsn); xlog_state_switch_iclogs(log, iclog, 0); - if (xlog_state_release_iclog(log, iclog)) + if (xlog_state_release_iclog(log, iclog, 0)) goto out_error;
if (be64_to_cpu(iclog->ic_header.h_lsn) != lsn) @@ -3179,7 +3195,7 @@ xlog_force_lsn( } atomic_inc(&iclog->ic_refcnt); xlog_state_switch_iclogs(log, iclog, 0); - if (xlog_state_release_iclog(log, iclog)) + if (xlog_state_release_iclog(log, iclog, 0)) goto out_error; if (log_flushed) *log_flushed = 1; diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c index b128aaa9b870..4c44bc3786c0 100644 --- a/fs/xfs/xfs_log_cil.c +++ b/fs/xfs/xfs_log_cil.c @@ -654,8 +654,9 @@ xlog_cil_push_work( struct xfs_trans_header thdr; struct xfs_log_iovec lhdr; struct xfs_log_vec lvhdr = { NULL }; + xfs_lsn_t preflush_tail_lsn; xfs_lsn_t commit_lsn; - xfs_lsn_t push_seq; + xfs_csn_t push_seq; struct bio bio; DECLARE_COMPLETION_ONSTACK(bdev_flush);
@@ -730,7 +731,15 @@ xlog_cil_push_work( * because we hold the flush lock exclusively. Hence we can now issue * a cache flush to ensure all the completed metadata in the journal we * are about to overwrite is on stable storage. + * + * Because we are issuing this cache flush before we've written the + * tail lsn to the iclog, we can have metadata IO completions move the + * tail forwards between the completion of this flush and the iclog + * being written. In this case, we need to re-issue the cache flush + * before the iclog write. To detect whether the log tail moves, sample + * the tail LSN *before* we issue the flush. */ + preflush_tail_lsn = atomic64_read(&log->l_tail_lsn); xfs_flush_bdev_async(&bio, log->l_mp->m_ddev_targp->bt_bdev, &bdev_flush);
@@ -941,7 +950,7 @@ xlog_cil_push_work( * storage. */ commit_iclog->ic_flags |= XLOG_ICL_NEED_FUA; - xlog_state_release_iclog(log, commit_iclog); + xlog_state_release_iclog(log, commit_iclog, preflush_tail_lsn); spin_unlock(&log->l_icloglock); return;
diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h index 138441ab9690..2c1ae43ac667 100644 --- a/fs/xfs/xfs_log_priv.h +++ b/fs/xfs/xfs_log_priv.h @@ -487,7 +487,8 @@ int xlog_commit_record(struct xlog *log, struct xlog_ticket *ticket, void xfs_log_ticket_ungrant(struct xlog *log, struct xlog_ticket *ticket); void xfs_log_ticket_regrant(struct xlog *log, struct xlog_ticket *ticket);
-int xlog_state_release_iclog(struct xlog *log, struct xlog_in_core *iclog); +int xlog_state_release_iclog(struct xlog *log, struct xlog_in_core *iclog, + xfs_lsn_t log_tail_lsn);
/* * When we crack an atomic LSN, we sample it first so that the value will not
From: Dave Chinner dchinner@redhat.com
mainline-inclusion from mainline-v5.14-rc1 commit 45eddb414047c366744cc60dd6cef7c7e58c6ab9 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4KIAO CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
-------------------------------------------------
We force iclogs in several places - we need them all to have the same cache flush semantics, so start by factoring out the iclog force into a common helper.
Signed-off-by: Dave Chinner dchinner@redhat.com Reviewed-by: Christoph Hellwig hch@lst.de Reviewed-by: Darrick J. Wong djwong@kernel.org Signed-off-by: Darrick J. Wong djwong@kernel.org Signed-off-by: Guo Xuenan guoxuenan@huawei.com Reviewed-by: Lihong Kou koulihong@huawei.com Reviewed-by: Zhang Yi yi.zhang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- fs/xfs/xfs_log.c | 42 ++++++++++++++++++------------------------ 1 file changed, 18 insertions(+), 24 deletions(-)
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 625db909f96a..52b23ad81186 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -752,6 +752,20 @@ xfs_log_mount_cancel( xfs_log_unmount(mp); }
+/* + * Flush out the iclog to disk ensuring that device caches are flushed and + * the iclog hits stable storage before any completion waiters are woken. + */ +static inline int +xlog_force_iclog( + struct xlog_in_core *iclog) +{ + atomic_inc(&iclog->ic_refcnt); + if (iclog->ic_state == XLOG_STATE_ACTIVE) + xlog_state_switch_iclogs(iclog->ic_log, iclog, 0); + return xlog_state_release_iclog(iclog->ic_log, iclog, 0); +} + /* * Wait for the iclog and all prior iclogs to be written disk as required by the * log force state machine. Waiting on ic_force_wait ensures iclog completions @@ -836,18 +850,8 @@ xlog_unmount_write(
spin_lock(&log->l_icloglock); iclog = log->l_iclog; - atomic_inc(&iclog->ic_refcnt); - if (iclog->ic_state == XLOG_STATE_ACTIVE) - xlog_state_switch_iclogs(log, iclog, 0); - else - ASSERT(iclog->ic_state == XLOG_STATE_WANT_SYNC || - iclog->ic_state == XLOG_STATE_IOERROR); - /* - * Ensure the journal is fully flushed and on stable storage once the - * iclog containing the unmount record is written. - */ iclog->ic_flags |= (XLOG_ICL_NEED_FLUSH | XLOG_ICL_NEED_FUA); - error = xlog_state_release_iclog(log, iclog, 0); + error = xlog_force_iclog(iclog); xlog_wait_on_iclog(iclog);
if (tic) { @@ -3106,17 +3110,9 @@ xfs_log_force( iclog = iclog->ic_prev; } else if (iclog->ic_state == XLOG_STATE_ACTIVE) { if (atomic_read(&iclog->ic_refcnt) == 0) { - /* - * We are the only one with access to this iclog. - * - * Flush it out now. There should be a roundoff of zero - * to show that someone has already taken care of the - * roundoff from the previous sync. - */ - atomic_inc(&iclog->ic_refcnt); + /* We have exclusive access to this iclog. */ lsn = be64_to_cpu(iclog->ic_header.h_lsn); - xlog_state_switch_iclogs(log, iclog, 0); - if (xlog_state_release_iclog(log, iclog, 0)) + if (xlog_force_iclog(iclog)) goto out_error;
if (be64_to_cpu(iclog->ic_header.h_lsn) != lsn) @@ -3193,9 +3189,7 @@ xlog_force_lsn( &log->l_icloglock); return -EAGAIN; } - atomic_inc(&iclog->ic_refcnt); - xlog_state_switch_iclogs(log, iclog, 0); - if (xlog_state_release_iclog(log, iclog, 0)) + if (xlog_force_iclog(iclog)) goto out_error; if (log_flushed) *log_flushed = 1;
From: Dave Chinner dchinner@redhat.com
mainline-inclusion from mainline-v5.14-rc1 commit 2bf1ec0ff067ff8f692d261b29c713f3583f7e2a category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4KIAO CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
-------------------------------------------------
After fixing the tail_lsn vs cache flush race, generic/482 continued to fail in a similar way where cache flushes were missing before iclog FUA writes. Tracing of iclog state changes during the fsstress workload portion of the test (via xlog_iclog* events) indicated that iclog writes were coming from two sources - CIL pushes and log forces (due to fsync/O_SYNC operations). All of the cases where a recovery problem was triggered indicated that the log force was the source of the iclog write that was not preceeded by a cache flush.
This was an oversight in the modifications made in commit eef983ffeae7 ("xfs: journal IO cache flush reductions"). Log forces for fsync imply a data device cache flush has been issued if an iclog was flushed to disk and is indicated to the caller via the log_flushed parameter so they can elide the device cache flush if the journal issued one.
The change in eef983ffeae7 results in iclogs only issuing a cache flush if XLOG_ICL_NEED_FLUSH is set on the iclog, but this was not added to the iclogs that the log force code flushes to disk. Hence log forces are no longer guaranteeing that a cache flush is issued, hence opening up a potential on-disk ordering failure.
Log forces should also set XLOG_ICL_NEED_FUA as well to ensure that the actual iclogs it forces to the journal are also on stable storage before it returns to the caller.
This patch introduces the xlog_force_iclog() helper function to encapsulate the process of taking a reference to an iclog, switching its state if WANT_SYNC and flushing it to stable storage correctly.
Both xfs_log_force() and xfs_log_force_lsn() are converted to use it, as is xlog_unmount_write() which has an elaborate method of doing exactly the same "write this iclog to stable storage" operation.
Further, if the log force code needs to wait on a iclog in the WANT_SYNC state, it needs to ensure that iclog also results in a cache flush being issued. This covers the case where the iclog contains the commit record of the CIL flush that the log force triggered, but it hasn't been written yet because there is still an active reference to the iclog.
Note: this whole cache flush whack-a-mole patch is a result of log forces still being iclog state centric rather than being CIL sequence centric. Most of this nasty code will go away in future when log forces are converted to wait on CIL sequence push completion rather than iclog completion. With the CIL push algorithm guaranteeing that the CIL checkpoint is fully on stable storage when it completes, we no longer need to iterate iclogs and push them to ensure a CIL sequence push has completed and so all this nasty iclog iteration and flushing code will go away.
Fixes: eef983ffeae7 ("xfs: journal IO cache flush reductions") Signed-off-by: Dave Chinner dchinner@redhat.com Reviewed-by: Christoph Hellwig hch@lst.de Reviewed-by: Darrick J. Wong djwong@kernel.org Signed-off-by: Darrick J. Wong djwong@kernel.org Signed-off-by: Guo Xuenan guoxuenan@huawei.com Reviewed-by: Lihong Kou koulihong@huawei.com Reviewed-by: Zhang Yi yi.zhang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- fs/xfs/xfs_log.c | 47 ++++++++++++++++++++++++++++++++++------------- 1 file changed, 34 insertions(+), 13 deletions(-)
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 52b23ad81186..fa9408e965a7 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -761,6 +761,7 @@ xlog_force_iclog( struct xlog_in_core *iclog) { atomic_inc(&iclog->ic_refcnt); + iclog->ic_flags |= XLOG_ICL_NEED_FLUSH | XLOG_ICL_NEED_FUA; if (iclog->ic_state == XLOG_STATE_ACTIVE) xlog_state_switch_iclogs(iclog->ic_log, iclog, 0); return xlog_state_release_iclog(iclog->ic_log, iclog, 0); @@ -850,7 +851,6 @@ xlog_unmount_write(
spin_lock(&log->l_icloglock); iclog = log->l_iclog; - iclog->ic_flags |= (XLOG_ICL_NEED_FLUSH | XLOG_ICL_NEED_FUA); error = xlog_force_iclog(iclog); xlog_wait_on_iclog(iclog);
@@ -3119,22 +3119,23 @@ xfs_log_force( goto out_unlock; } else { /* - * Someone else is writing to this iclog. - * - * Use its call to flush out the data. However, the - * other thread may not force out this LR, so we mark - * it WANT_SYNC. + * Someone else is still writing to this iclog, so we + * need to ensure that when they release the iclog it + * gets synced immediately as we may be waiting on it. */ xlog_state_switch_iclogs(log, iclog, 0); } - } else { - /* - * If the head iclog is not active nor dirty, we just attach - * ourselves to the head and go to sleep if necessary. - */ - ; }
+ /* + * The iclog we are about to wait on may contain the checkpoint pushed + * by the above xlog_cil_force() call, but it may not have been pushed + * to disk yet. Like the ACTIVE case above, we need to make sure caches + * are flushed when this iclog is written. + */ + if (iclog->ic_state == XLOG_STATE_WANT_SYNC) + iclog->ic_flags |= XLOG_ICL_NEED_FLUSH | XLOG_ICL_NEED_FUA; + if (flags & XFS_LOG_SYNC) return xlog_wait_on_iclog(iclog); out_unlock: @@ -3166,7 +3167,8 @@ xlog_force_lsn( goto out_unlock; }
- if (iclog->ic_state == XLOG_STATE_ACTIVE) { + switch (iclog->ic_state) { + case XLOG_STATE_ACTIVE: /* * We sleep here if we haven't already slept (e.g. this is the * first time we've looked at the correct iclog buf) and the @@ -3193,6 +3195,25 @@ xlog_force_lsn( goto out_error; if (log_flushed) *log_flushed = 1; + break; + case XLOG_STATE_WANT_SYNC: + /* + * This iclog may contain the checkpoint pushed by the + * xlog_cil_force_seq() call, but there are other writers still + * accessing it so it hasn't been pushed to disk yet. Like the + * ACTIVE case above, we need to make sure caches are flushed + * when this iclog is written. + */ + iclog->ic_flags |= XLOG_ICL_NEED_FLUSH | XLOG_ICL_NEED_FUA; + break; + default: + /* + * The entire checkpoint was written by the CIL force and is on + * its way to disk already. It will be stable when it + * completes, so we don't need to manipulate caches here at all. + * We just need to wait for completion if necessary. + */ + break; }
if (flags & XFS_LOG_SYNC)
From: Dave Chinner dchinner@redhat.com
mainline-inclusion from mainline-v5.14-rc1 commit 8191d8222c514c69a8e1ac46bd9812b9e0aab7d0 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4KIAO CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
------------------------------------------------- xfs: avoid unnecessary waits in xfs_log_force_lsn()
Before waiting on a iclog in xfs_log_force_lsn(), we don't check to see if the iclog has already been completed and the contents on stable storage. We check for completed iclogs in xfs_log_force(), so we should do the same thing for xfs_log_force_lsn().
This fixed some random up-to-30s pauses seen in unmounting filesystems in some tests. A log force ends up waiting on completed iclog, and that doesn't then get flushed (and hence the log force get completed) until the background log worker issues a log force that flushes the iclog in question. Then the unmount unblocks and continues.
Signed-off-by: Dave Chinner dchinner@redhat.com Reviewed-by: Christoph Hellwig hch@lst.de Reviewed-by: Darrick J. Wong djwong@kernel.org Signed-off-by: Darrick J. Wong djwong@kernel.org Signed-off-by: Guo Xuenan guoxuenan@huawei.com Reviewed-by: Lihong Kou koulihong@huawei.com Reviewed-by: Zhang Yi yi.zhang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- fs/xfs/xfs_log.c | 42 +++++++++++++++++++++++++++++++++++++----- 1 file changed, 37 insertions(+), 5 deletions(-)
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index fa9408e965a7..447a8903dd06 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -3050,6 +3050,35 @@ xlog_state_switch_iclogs( log->l_iclog = iclog->ic_next; }
+/* + * Force the iclog to disk and check if the iclog has been completed before + * xlog_force_iclog() returns. This can happen on synchronous (e.g. + * pmem) or fast async storage because we drop the icloglock to issue the IO. + * If completion has already occurred, tell the caller so that it can avoid an + * unnecessary wait on the iclog. + */ +static int +xlog_force_and_check_iclog( + struct xlog_in_core *iclog, + bool *completed) +{ + xfs_lsn_t lsn = be64_to_cpu(iclog->ic_header.h_lsn); + int error; + + *completed = false; + error = xlog_force_iclog(iclog); + if (error) + return error; + + /* + * If the iclog has already been completed and reused the header LSN + * will have been rewritten by completion + */ + if (be64_to_cpu(iclog->ic_header.h_lsn) != lsn) + *completed = true; + return 0; +} + /* * Write out all data in the in-core log as of this exact moment in time. * @@ -3084,7 +3113,6 @@ xfs_log_force( { struct xlog *log = mp->m_log; struct xlog_in_core *iclog; - xfs_lsn_t lsn;
XFS_STATS_INC(mp, xs_log_force); trace_xfs_log_force(mp, 0, _RET_IP_); @@ -3111,11 +3139,12 @@ xfs_log_force( } else if (iclog->ic_state == XLOG_STATE_ACTIVE) { if (atomic_read(&iclog->ic_refcnt) == 0) { /* We have exclusive access to this iclog. */ - lsn = be64_to_cpu(iclog->ic_header.h_lsn); - if (xlog_force_iclog(iclog)) + bool completed; + + if (xlog_force_and_check_iclog(iclog, &completed)) goto out_error;
- if (be64_to_cpu(iclog->ic_header.h_lsn) != lsn) + if (completed) goto out_unlock; } else { /* @@ -3155,6 +3184,7 @@ xlog_force_lsn( bool already_slept) { struct xlog_in_core *iclog; + bool completed;
spin_lock(&log->l_icloglock); iclog = log->l_iclog; @@ -3191,10 +3221,12 @@ xlog_force_lsn( &log->l_icloglock); return -EAGAIN; } - if (xlog_force_iclog(iclog)) + if (xlog_force_and_check_iclog(iclog, &completed)) goto out_error; if (log_flushed) *log_flushed = 1; + if (completed) + goto out_unlock; break; case XLOG_STATE_WANT_SYNC: /*
From: Dave Chinner dchinner@redhat.com
mainline-inclusion from mainline-v5.14-rc1 commit 32baa63d82ee3f5ab3bd51bae6bf7d1c15aed8c7 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4KIAO CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
-------------------------------------------------
When we log an inode, we format the "log inode" core and set an LSN in that inode core. We do that via xfs_inode_item_format_core(), which calls:
xfs_inode_to_log_dinode(ip, dic, ip->i_itemp->ili_item.li_lsn);
to format the log inode. It writes the LSN from the inode item into the log inode, and if recovery decides the inode item needs to be replayed, it recovers the log inode LSN field and writes it into the on disk inode LSN field.
Now this might seem like a reasonable thing to do, but it is wrong on multiple levels. Firstly, if the item is not yet in the AIL, item->li_lsn is zero. i.e. the first time the inode it is logged and formatted, the LSN we write into the log inode will be zero. If we only log it once, recovery will run and can write this zero LSN into the inode.
This means that the next time the inode is logged and log recovery runs, it will *always* replay changes to the inode regardless of whether the inode is newer on disk than the version in the log and that violates the entire purpose of recording the LSN in the inode at writeback time (i.e. to stop it going backwards in time on disk during recovery).
Secondly, if we commit the CIL to the journal so the inode item moves to the AIL, and then relog the inode, the LSN that gets stamped into the log inode will be the LSN of the inode's current location in the AIL, not it's age on disk. And it's not the LSN that will be associated with the current change. That means when log recovery replays this inode item, the LSN that ends up on disk is the LSN for the previous changes in the log, not the current changes being replayed. IOWs, after recovery the LSN on disk is not in sync with the LSN of the modifications that were replayed into the inode. This, again, violates the recovery ordering semantics that on-disk writeback LSNs provide.
Hence the inode LSN in the log dinode is -always- invalid.
Thirdly, recovery actually has the LSN of the log transaction it is replaying right at hand - it uses it to determine if it should replay the inode by comparing it to the on-disk inode's LSN. But it doesn't use that LSN to stamp the LSN into the inode which will be written back when the transaction is fully replayed. It uses the one in the log dinode, which we know is always going to be incorrect.
Looking back at the change history, the inode logging was broken by commit 93f958f9c41f ("xfs: cull unnecessary icdinode fields") way back in 2016 by a stupid idiot who thought he knew how this code worked. i.e. me. That commit replaced an in memory di_lsn field that was updated only at inode writeback time from the inode item.li_lsn value - and hence always contained the same LSN that appeared in the on-disk inode - with a read of the inode item LSN at inode format time. CLearly these are not the same thing.
Before 93f958f9c41f, the log recovery behaviour was irrelevant, because the LSN in the log inode always matched the on-disk LSN at the time the inode was logged, hence recovery of the transaction would never make the on-disk LSN in the inode go backwards or get out of sync.
A symptom of the problem is this, caught from a failure of generic/482. Before log recovery, the inode has been allocated but never used:
xfs_db> inode 393388 xfs_db> p core.magic = 0x494e core.mode = 0 .... v3.crc = 0x99126961 (correct) v3.change_count = 0 v3.lsn = 0 v3.flags2 = 0 v3.cowextsize = 0 v3.crtime.sec = Thu Jan 1 10:00:00 1970 v3.crtime.nsec = 0
After log recovery:
xfs_db> p core.magic = 0x494e core.mode = 020444 .... v3.crc = 0x23e68f23 (correct) v3.change_count = 2 v3.lsn = 0 v3.flags2 = 0 v3.cowextsize = 0 v3.crtime.sec = Thu Jul 22 17:03:03 2021 v3.crtime.nsec = 751000000 ...
You can see that the LSN of the on-disk inode is 0, even though it clearly has been written to disk. I point out this inode, because the generic/482 failure occurred because several adjacent inodes in this specific inode cluster were not replayed correctly and still appeared to be zero on disk when all the other metadata (inobt, finobt, directories, etc) indicated they should be allocated and written back.
The fix for this is two-fold. The first is that we need to either revert the LSN changes in 93f958f9c41f or stop logging the inode LSN altogether. If we do the former, log recovery does not need to change but we add 8 bytes of memory per inode to store what is largely a write-only inode field. If we do the latter, log recovery needs to stamp the on-disk inode in the same manner that inode writeback does.
I prefer the latter, because we shouldn't really be trying to log and replay changes to the on disk LSN as the on-disk value is the canonical source of the on-disk version of the inode. It also matches the way we recover buffer items - we create a buf_log_item that carries the current recovery transaction LSN that gets stamped into the buffer by the write verifier when it gets written back when the transaction is fully recovered.
However, this might break log recovery on older kernels even more, so I'm going to simply ignore the logged value in recovery and stamp the on-disk inode with the LSN of the transaction being recovered that will trigger writeback on transaction recovery completion. This will ensure that the on-disk inode LSN always reflects the LSN of the last change that was written to disk, regardless of whether it comes from log recovery or runtime writeback.
Fixes: 93f958f9c41f ("xfs: cull unnecessary icdinode fields") Signed-off-by: Dave Chinner dchinner@redhat.com Reviewed-by: Darrick J. Wong djwong@kernel.org Signed-off-by: Darrick J. Wong djwong@kernel.org Signed-off-by: Guo Xuenan guoxuenan@huawei.com Reviewed-by: Lihong Kou koulihong@huawei.com Reviewed-by: Zhang Yi yi.zhang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- fs/xfs/libxfs/xfs_log_format.h | 11 +++++++++- fs/xfs/xfs_inode_item_recover.c | 39 ++++++++++++++++++++++++--------- 2 files changed, 39 insertions(+), 11 deletions(-)
diff --git a/fs/xfs/libxfs/xfs_log_format.h b/fs/xfs/libxfs/xfs_log_format.h index 16587219549c..954f137d10b7 100644 --- a/fs/xfs/libxfs/xfs_log_format.h +++ b/fs/xfs/libxfs/xfs_log_format.h @@ -411,7 +411,16 @@ struct xfs_log_dinode { /* start of the extended dinode, writable fields */ uint32_t di_crc; /* CRC of the inode */ uint64_t di_changecount; /* number of attribute changes */ - xfs_lsn_t di_lsn; /* flush sequence */ + + /* + * The LSN we write to this field during formatting is not a reflection + * of the current on-disk LSN. It should never be used for recovery + * sequencing, nor should it be recovered into the on-disk inode at all. + * See xlog_recover_inode_commit_pass2() and xfs_log_dinode_to_disk() + * for details. + */ + xfs_lsn_t di_lsn; + uint64_t di_flags2; /* more random flags */ uint32_t di_cowextsize; /* basic cow extent size for file */ uint8_t di_pad2[12]; /* more padding for future expansion */ diff --git a/fs/xfs/xfs_inode_item_recover.c b/fs/xfs/xfs_inode_item_recover.c index cb44f7653f03..538724f9f85c 100644 --- a/fs/xfs/xfs_inode_item_recover.c +++ b/fs/xfs/xfs_inode_item_recover.c @@ -145,7 +145,8 @@ xfs_log_dinode_to_disk_ts( STATIC void xfs_log_dinode_to_disk( struct xfs_log_dinode *from, - struct xfs_dinode *to) + struct xfs_dinode *to, + xfs_lsn_t lsn) { to->di_magic = cpu_to_be16(from->di_magic); to->di_mode = cpu_to_be16(from->di_mode); @@ -182,7 +183,7 @@ xfs_log_dinode_to_disk( to->di_flags2 = cpu_to_be64(from->di_flags2); to->di_cowextsize = cpu_to_be32(from->di_cowextsize); to->di_ino = cpu_to_be64(from->di_ino); - to->di_lsn = cpu_to_be64(from->di_lsn); + to->di_lsn = cpu_to_be64(lsn); memcpy(to->di_pad2, from->di_pad2, sizeof(to->di_pad2)); uuid_copy(&to->di_uuid, &from->di_uuid); to->di_flushiter = 0; @@ -261,16 +262,25 @@ xlog_recover_inode_commit_pass2( }
/* - * If the inode has an LSN in it, recover the inode only if it's less - * than the lsn of the transaction we are replaying. Note: we still - * need to replay an owner change even though the inode is more recent - * than the transaction as there is no guarantee that all the btree - * blocks are more recent than this transaction, too. + * If the inode has an LSN in it, recover the inode only if the on-disk + * inode's LSN is older than the lsn of the transaction we are + * replaying. We can have multiple checkpoints with the same start LSN, + * so the current LSN being equal to the on-disk LSN doesn't necessarily + * mean that the on-disk inode is more recent than the change being + * replayed. + * + * We must check the current_lsn against the on-disk inode + * here because the we can't trust the log dinode to contain a valid LSN + * (see comment below before replaying the log dinode for details). + * + * Note: we still need to replay an owner change even though the inode + * is more recent than the transaction as there is no guarantee that all + * the btree blocks are more recent than this transaction, too. */ if (dip->di_version >= 3) { xfs_lsn_t lsn = be64_to_cpu(dip->di_lsn);
- if (lsn && lsn != -1 && XFS_LSN_CMP(lsn, current_lsn) >= 0) { + if (lsn && lsn != -1 && XFS_LSN_CMP(lsn, current_lsn) > 0) { trace_xfs_log_recover_inode_skip(log, in_f); error = 0; goto out_owner_change; @@ -368,8 +378,17 @@ xlog_recover_inode_commit_pass2( goto out_release; }
- /* recover the log dinode inode into the on disk inode */ - xfs_log_dinode_to_disk(ldip, dip); + /* + * Recover the log dinode inode into the on disk inode. + * + * The LSN in the log dinode is garbage - it can be zero or reflect + * stale in-memory runtime state that isn't coherent with the changes + * logged in this transaction or the changes written to the on-disk + * inode. Hence we write the current lSN into the inode because that + * matches what xfs_iflush() would write inode the inode when flushing + * the changes in this transaction. + */ + xfs_log_dinode_to_disk(ldip, dip, current_lsn);
fields = in_f->ilf_fields; if (fields & XFS_ILOG_DEV)
From: Dave Chinner dchinner@redhat.com
mainline-inclusion from mainline-v5.14-rc1 commit d8f4c2d0398fa1d92cacf854daf80d21a46bfefc category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4KIAO CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
-------------------------------------------------
From the department of "WTAF? How did we miss that!?"...
When we are recovering a buffer, the first thing we do is check the buffer magic number and extract the LSN from the buffer. If the LSN is older than the current LSN, we replay the modification to it. If the metadata on disk is newer than the transaction in the log, we skip it. This is a fundamental v5 filesystem metadata recovery behaviour.
generic/482 failed with an attribute writeback failure during log recovery. The write verifier caught the corruption before it got written to disk, and the attr buffer dump looked like:
XFS (dm-3): Metadata corruption detected at xfs_attr3_leaf_verify+0x275/0x2e0, xfs_attr3_leaf block 0x19be8 XFS (dm-3): Unmount and run xfs_repair XFS (dm-3): First 128 bytes of corrupted metadata buffer: 00000000: 00 00 00 00 00 00 00 00 3b ee 00 00 4d 2a 01 e1 ........;...M*.. 00000010: 00 00 00 00 00 01 9b e8 00 00 00 01 00 00 05 38 ...............8 ^^^^^^^^^^^^^^^^^^^^^^^ 00000020: df 39 5e 51 58 ac 44 b6 8d c5 e7 10 44 09 bc 17 .9^QX.D.....D... 00000030: 00 00 00 00 00 02 00 83 00 03 00 cc 0f 24 01 00 .............$.. 00000040: 00 68 0e bc 0f c8 00 10 00 00 00 00 00 00 00 00 .h.............. 00000050: 00 00 3c 31 0f 24 01 00 00 00 3c 32 0f 88 01 00 ..<1.$....<2.... 00000060: 00 00 3c 33 0f d8 01 00 00 00 00 00 00 00 00 00 ..<3............ 00000070: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ .....
The highlighted bytes are the LSN that was replayed into the buffer: 0x100000538. This is cycle 1, block 0x538. Prior to replay, that block on disk looks like this:
$ sudo xfs_db -c "fsb 0x417d" -c "type attr3" -c p /dev/mapper/thin-vol hdr.info.hdr.forw = 0 hdr.info.hdr.back = 0 hdr.info.hdr.magic = 0x3bee hdr.info.crc = 0xb5af0bc6 (correct) hdr.info.bno = 105448 hdr.info.lsn = 0x100000900 ^^^^^^^^^^^ hdr.info.uuid = df395e51-58ac-44b6-8dc5-e7104409bc17 hdr.info.owner = 131203 hdr.count = 2 hdr.usedbytes = 120 hdr.firstused = 3796 hdr.holes = 1 hdr.freemap[0-2] = [base,size]
Note the LSN stamped into the buffer on disk: 1/0x900. The version on disk is much newer than the log transaction that was being replayed. That's a bug, and should -never- happen.
So I immediately went to look at xlog_recover_get_buf_lsn() to check that we handled the LSN correctly. I was wondering if there was a similar "two commits with the same start LSN skips the second replay" problem with buffers. I didn't get that far, because I found a much more basic, rudimentary bug: xlog_recover_get_buf_lsn() doesn't recognise buffers with XFS_ATTR3_LEAF_MAGIC set in them!!!
IOWs, attr3 leaf buffers fall through the magic number checks unrecognised, so trigger the "recover immediately" behaviour instead of undergoing an LSN check. IOWs, we incorrectly replay ATTR3 leaf buffers and that causes silent on disk corruption of inode attribute forks and potentially other things....
Git history shows this is *another* zero day bug, this time introduced in commit 50d5c8d8e938 ("xfs: check LSN ordering for v5 superblocks during recovery") which failed to handle the attr3 leaf buffers in recovery. And we've failed to handle them ever since...
Signed-off-by: Dave Chinner dchinner@redhat.com Reviewed-by: Christoph Hellwig hch@lst.de Reviewed-by: Darrick J. Wong djwong@kernel.org Signed-off-by: Darrick J. Wong djwong@kernel.org Signed-off-by: Guo Xuenan guoxuenan@huawei.com Reviewed-by: Lihong Kou koulihong@huawei.com Reviewed-by: Zhang Yi yi.zhang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- fs/xfs/xfs_buf_item_recover.c | 1 + 1 file changed, 1 insertion(+)
diff --git a/fs/xfs/xfs_buf_item_recover.c b/fs/xfs/xfs_buf_item_recover.c index d44e8b4a3391..05fd816edf59 100644 --- a/fs/xfs/xfs_buf_item_recover.c +++ b/fs/xfs/xfs_buf_item_recover.c @@ -796,6 +796,7 @@ xlog_recover_get_buf_lsn( switch (magicda) { case XFS_DIR3_LEAF1_MAGIC: case XFS_DIR3_LEAFN_MAGIC: + case XFS_ATTR3_LEAF_MAGIC: case XFS_DA3_NODE_MAGIC: lsn = be64_to_cpu(((struct xfs_da3_blkinfo *)blk)->lsn); uuid = &((struct xfs_da3_blkinfo *)blk)->uuid;
From: "Darrick J. Wong" djwong@kernel.org
mainline-inclusion from mainline-v5.14-rc1 commit 81a448d7b0668ae39c08e6f34a54cc7eafb844f1 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4KIAO CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
-------------------------------------------------
While reviewing the buffer item recovery code, the thought occurred to me: in V5 filesystems we use log sequence number (LSN) tracking to avoid replaying older metadata updates against newer log items. However, we use the magic number of the ondisk buffer to find the LSN of the ondisk metadata, which means that if an attacker can control the layout of the realtime device precisely enough that the start of an rt bitmap block matches the magic and UUID of some other kind of block, they can control the purported LSN of that spoofed block and thereby break log replay.
Since realtime bitmap and summary blocks don't have headers at all, we have no way to tell if a block really should be replayed. The best we can do is replay unconditionally and hope for the best.
Signed-off-by: Darrick J. Wong djwong@kernel.org Reviewed-by: Dave Chinner dchinner@redhat.com Reviewed-by: Carlos Maiolino cmaiolino@redhat.com Signed-off-by: Guo Xuenan guoxuenan@huawei.com Reviewed-by: Lihong Kou koulihong@huawei.com Reviewed-by: Zhang Yi yi.zhang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- fs/xfs/xfs_buf_item_recover.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-)
diff --git a/fs/xfs/xfs_buf_item_recover.c b/fs/xfs/xfs_buf_item_recover.c index 05fd816edf59..4775485b4062 100644 --- a/fs/xfs/xfs_buf_item_recover.c +++ b/fs/xfs/xfs_buf_item_recover.c @@ -698,7 +698,8 @@ xlog_recover_do_inode_buffer( static xfs_lsn_t xlog_recover_get_buf_lsn( struct xfs_mount *mp, - struct xfs_buf *bp) + struct xfs_buf *bp, + struct xfs_buf_log_format *buf_f) { uint32_t magic32; uint16_t magic16; @@ -706,11 +707,20 @@ xlog_recover_get_buf_lsn( void *blk = bp->b_addr; uuid_t *uuid; xfs_lsn_t lsn = -1; + uint16_t blft;
/* v4 filesystems always recover immediately */ if (!xfs_sb_version_hascrc(&mp->m_sb)) goto recover_immediately;
+ /* + * realtime bitmap and summary file blocks do not have magic numbers or + * UUIDs, so we must recover them immediately. + */ + blft = xfs_blft_from_flags(buf_f); + if (blft == XFS_BLFT_RTBITMAP_BUF || blft == XFS_BLFT_RTSUMMARY_BUF) + goto recover_immediately; + magic32 = be32_to_cpu(*(__be32 *)blk); switch (magic32) { case XFS_ABTB_CRC_MAGIC: @@ -920,7 +930,7 @@ xlog_recover_buf_commit_pass2( * the verifier will be reset to match whatever recover turns that * buffer into. */ - lsn = xlog_recover_get_buf_lsn(mp, bp); + lsn = xlog_recover_get_buf_lsn(mp, bp, buf_f); if (lsn && lsn != -1 && XFS_LSN_CMP(lsn, current_lsn) >= 0) { trace_xfs_log_recover_buf_skip(log, buf_f); xlog_recover_validate_buf_type(mp, bp, buf_f, NULLCOMMITLSN);
From: Dave Chinner dchinner@redhat.com
mainline-inclusion from mainline-v5.14-rc4 commit f1653c2e2831e9db6cd68473bbec581782df03a5 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4KIAO CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
-------------------------------------------------
We need to move to per-cpu state for both deferred inode inactivation and CIL tracking, but to do that we need to handle CPUs being removed from the system by the hot-plug code. Introduce generic XFS infrastructure to handle CPU hotplug events that is set up at module init time and torn down at module exit time.
Initially, we only need CPU dead notifications, so we only set up a callback for these notifications. The infrastructure can be updated in future for other CPU hotplug state machine notifications easily if ever needed.
Signed-off-by: Dave Chinner dchinner@redhat.com [djwong: rearrange some macros, fix function prototypes] Reviewed-by: Darrick J. Wong djwong@kernel.org Signed-off-by: Darrick J. Wong djwong@kernel.org Signed-off-by: Guo Xuenan guoxuenan@huawei.com Reviewed-by: Lihong Kou koulihong@huawei.com Reviewed-by: Zhang Yi yi.zhang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- fs/xfs/xfs_super.c | 42 +++++++++++++++++++++++++++++++++++++- include/linux/cpuhotplug.h | 1 + 2 files changed, 42 insertions(+), 1 deletion(-)
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index e3e229e52512..1d0eb196eea6 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -2099,6 +2099,39 @@ xfs_destroy_workqueues(void) destroy_workqueue(xfs_alloc_wq); }
+#ifdef CONFIG_HOTPLUG_CPU +static int +xfs_cpu_dead( + unsigned int cpu) +{ + return 0; +} + +static int __init +xfs_cpu_hotplug_init(void) +{ + int error; + + error = cpuhp_setup_state_nocalls(CPUHP_XFS_DEAD, "xfs:dead", NULL, + xfs_cpu_dead); + if (error < 0) + xfs_alert(NULL, +"Failed to initialise CPU hotplug, error %d. XFS is non-functional.", + error); + return error; +} + +static void +xfs_cpu_hotplug_destroy(void) +{ + cpuhp_remove_state_nocalls(CPUHP_XFS_DEAD); +} + +#else /* !CONFIG_HOTPLUG_CPU */ +static inline int xfs_cpu_hotplug_init(void) { return 0; } +static inline void xfs_cpu_hotplug_destroy(void) {} +#endif + STATIC int __init init_xfs_fs(void) { @@ -2111,10 +2144,14 @@ init_xfs_fs(void)
xfs_dir_startup();
- error = xfs_init_zones(); + error = xfs_cpu_hotplug_init(); if (error) goto out;
+ error = xfs_init_zones(); + if (error) + goto out_destroy_hp; + error = xfs_init_workqueues(); if (error) goto out_destroy_zones; @@ -2194,6 +2231,8 @@ init_xfs_fs(void) xfs_destroy_workqueues(); out_destroy_zones: xfs_destroy_zones(); + out_destroy_hp: + xfs_cpu_hotplug_destroy(); out: return error; } @@ -2216,6 +2255,7 @@ exit_xfs_fs(void) xfs_destroy_workqueues(); xfs_destroy_zones(); xfs_uuid_table_free(); + xfs_cpu_hotplug_destroy(); }
module_init(init_xfs_fs); diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index 3fb643bf3bc0..b98b9eb7d5f8 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -52,6 +52,7 @@ enum cpuhp_state { CPUHP_FS_BUFF_DEAD, CPUHP_PRINTK_DEAD, CPUHP_MM_MEMCQ_DEAD, + CPUHP_XFS_DEAD, CPUHP_PERCPU_CNT_DEAD, CPUHP_RADIX_DEAD, CPUHP_PAGE_ALLOC_DEAD,
From: Dave Chinner dchinner@redhat.com
mainline-inclusion from mainline-v5.14-rc4 commit 0ed17f01c85401abf1706d9825796fc6661c0889 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4KIAO CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
-------------------------------------------------
The inode inactivation and CIL tracking percpu structures are per-xfs_mount structures. That means when we get a CPU dead notification, we need to then iterate all the per-cpu structure instances to process them. Rather than keeping linked lists of per-cpu structures in each subsystem, add a list of all xfs_mounts that the generic xfs_cpu_dead() function will iterate and call into each subsystem appropriately.
This allows us to handle both per-mount and global XFS percpu state from xfs_cpu_dead(), and avoids the need to link subsystem structures that can be easily found from the xfs_mount into their own global lists.
Signed-off-by: Dave Chinner dchinner@redhat.com [djwong: expand some comments about mount list setup ordering rules] Reviewed-by: Darrick J. Wong djwong@kernel.org Signed-off-by: Darrick J. Wong djwong@kernel.org Signed-off-by: Guo Xuenan guoxuenan@huawei.com Reviewed-by: Lihong Kou koulihong@huawei.com Reviewed-by: Zhang Yi yi.zhang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- fs/xfs/xfs_mount.h | 1 + fs/xfs/xfs_super.c | 40 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 41 insertions(+)
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index dfa429b77ee2..481d1e52b541 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -81,6 +81,7 @@ typedef struct xfs_mount { xfs_buftarg_t *m_ddev_targp; /* saves taking the address */ xfs_buftarg_t *m_logdev_targp;/* ptr to log device */ xfs_buftarg_t *m_rtdev_targp; /* ptr to rt device */ + struct list_head m_mount_list; /* global mount list */ /* * Optional cache of rt summary level per bitmap block with the * invariant that m_rsum_cache[bbno] <= the minimum i for which diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 1d0eb196eea6..1eca53df25f6 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -47,6 +47,28 @@ static struct kset *xfs_kset; /* top-level xfs sysfs dir */ static struct xfs_kobj xfs_dbg_kobj; /* global debug sysfs attrs */ #endif
+#ifdef CONFIG_HOTPLUG_CPU +static LIST_HEAD(xfs_mount_list); +static DEFINE_SPINLOCK(xfs_mount_list_lock); + +static inline void xfs_mount_list_add(struct xfs_mount *mp) +{ + spin_lock(&xfs_mount_list_lock); + list_add(&mp->m_mount_list, &xfs_mount_list); + spin_unlock(&xfs_mount_list_lock); +} + +static inline void xfs_mount_list_del(struct xfs_mount *mp) +{ + spin_lock(&xfs_mount_list_lock); + list_del(&mp->m_mount_list); + spin_unlock(&xfs_mount_list_lock); +} +#else /* !CONFIG_HOTPLUG_CPU */ +static inline void xfs_mount_list_add(struct xfs_mount *mp) {} +static inline void xfs_mount_list_del(struct xfs_mount *mp) {} +#endif + enum xfs_dax_mode { XFS_DAX_INODE = 0, XFS_DAX_ALWAYS = 1, @@ -1076,6 +1098,7 @@ xfs_fs_put_super(
xfs_freesb(mp); free_percpu(mp->m_stats.xs_stats); + xfs_mount_list_del(mp); xfs_destroy_percpu_counters(mp); xfs_destroy_mount_workqueues(mp); xfs_close_devices(mp); @@ -1435,6 +1458,13 @@ xfs_fc_fill_super( if (error) goto out_destroy_workqueues;
+ /* + * All percpu data structures requiring cleanup when a cpu goes offline + * must be allocated before adding this @mp to the cpu-dead handler's + * mount list. + */ + xfs_mount_list_add(mp); + /* Allocate stats memory before we do operations that might use it */ mp->m_stats.xs_stats = alloc_percpu(struct xfsstats); if (!mp->m_stats.xs_stats) { @@ -1604,6 +1634,7 @@ xfs_fc_fill_super( out_free_stats: free_percpu(mp->m_stats.xs_stats); out_destroy_counters: + xfs_mount_list_del(mp); xfs_destroy_percpu_counters(mp); out_destroy_workqueues: xfs_destroy_mount_workqueues(mp); @@ -2104,6 +2135,15 @@ static int xfs_cpu_dead( unsigned int cpu) { + struct xfs_mount *mp, *n; + + spin_lock(&xfs_mount_list_lock); + list_for_each_entry_safe(mp, n, &xfs_mount_list, m_mount_list) { + spin_unlock(&xfs_mount_list_lock); + /* xfs_subsys_dead(mp, cpu); */ + spin_lock(&xfs_mount_list_lock); + } + spin_unlock(&xfs_mount_list_lock); return 0; }
From: Dave Chinner dchinner@redhat.com
mainline-inclusion from mainline-v5.14-rc4 commit de2860f4636256836450c6543be744a50118fc66 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4KIAO CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
-------------------------------------------------
During log recovery of an XFS filesystem with 64kB directory buffers, rebuilding a buffer split across two log records results in a memory allocation warning from krealloc like this:
xfs filesystem being mounted at /mnt/scratch supports timestamps until 2038 (0x7fffffff) XFS (dm-0): Unmounting Filesystem XFS (dm-0): Mounting V5 Filesystem XFS (dm-0): Starting recovery (logdev: internal) ------------[ cut here ]------------ WARNING: CPU: 5 PID: 3435170 at mm/page_alloc.c:3539 get_page_from_freelist+0xdee/0xe40 ..... RIP: 0010:get_page_from_freelist+0xdee/0xe40 Call Trace: ? complete+0x3f/0x50 __alloc_pages+0x16f/0x300 alloc_pages+0x87/0x110 kmalloc_order+0x2c/0x90 kmalloc_order_trace+0x1d/0x90 __kmalloc_track_caller+0x215/0x270 ? xlog_recover_add_to_cont_trans+0x63/0x1f0 krealloc+0x54/0xb0 xlog_recover_add_to_cont_trans+0x63/0x1f0 xlog_recovery_process_trans+0xc1/0xd0 xlog_recover_process_ophdr+0x86/0x130 xlog_recover_process_data+0x9f/0x160 xlog_recover_process+0xa2/0x120 xlog_do_recovery_pass+0x40b/0x7d0 ? __irq_work_queue_local+0x4f/0x60 ? irq_work_queue+0x3a/0x50 xlog_do_log_recovery+0x70/0x150 xlog_do_recover+0x38/0x1d0 xlog_recover+0xd8/0x170 xfs_log_mount+0x181/0x300 xfs_mountfs+0x4a1/0x9b0 xfs_fs_fill_super+0x3c0/0x7b0 get_tree_bdev+0x171/0x270 ? suffix_kstrtoint.constprop.0+0xf0/0xf0 xfs_fs_get_tree+0x15/0x20 vfs_get_tree+0x24/0xc0 path_mount+0x2f5/0xaf0 __x64_sys_mount+0x108/0x140 do_syscall_64+0x3a/0x70 entry_SYSCALL_64_after_hwframe+0x44/0xae
Essentially, we are taking a multi-order allocation from kmem_alloc() (which has an open coded no fail, no warn loop) and then reallocating it out to 64kB using krealloc(__GFP_NOFAIL) and that is then triggering the above warning.
This is a regression caused by converting this code from an open coded no fail/no warn reallocation loop to using __GFP_NOFAIL.
What we actually need here is kvrealloc(), so that if contiguous page allocation fails we fall back to vmalloc() and we don't get nasty warnings happening in XFS.
Fixes: 771915c4f688 ("xfs: remove kmem_realloc()") Signed-off-by: Dave Chinner dchinner@redhat.com Acked-by: Mel Gorman mgorman@techsingularity.net Reviewed-by: Darrick J. Wong djwong@kernel.org Signed-off-by: Darrick J. Wong djwong@kernel.org Signed-off-by: Guo Xuenan guoxuenan@huawei.com Reviewed-by: Lihong Kou koulihong@huawei.com Reviewed-by: Zhang Yi yi.zhang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- fs/xfs/xfs_log_recover.c | 4 +++- include/linux/mm.h | 2 ++ mm/util.c | 15 +++++++++++++++ 3 files changed, 20 insertions(+), 1 deletion(-)
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 69408782019e..e61f28ce3e44 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -2061,7 +2061,9 @@ xlog_recover_add_to_cont_trans( old_ptr = item->ri_buf[item->ri_cnt-1].i_addr; old_len = item->ri_buf[item->ri_cnt-1].i_len;
- ptr = krealloc(old_ptr, len + old_len, GFP_KERNEL | __GFP_NOFAIL); + ptr = kvrealloc(old_ptr, old_len, len + old_len, GFP_KERNEL); + if (!ptr) + return -ENOMEM; memcpy(&ptr[old_len], dp, len); item->ri_buf[item->ri_cnt-1].i_len += len; item->ri_buf[item->ri_cnt-1].i_addr = ptr; diff --git a/include/linux/mm.h b/include/linux/mm.h index 100c113e62a7..e30344cd6291 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -810,6 +810,8 @@ static inline void *kvcalloc(size_t n, size_t size, gfp_t flags) return kvmalloc_array(n, size, flags | __GFP_ZERO); }
+extern void *kvrealloc(const void *p, size_t oldsize, size_t newsize, + gfp_t flags); extern void kvfree(const void *addr); extern void kvfree_sensitive(const void *addr, size_t len);
diff --git a/mm/util.c b/mm/util.c index 90792e4eaa25..47d074912630 100644 --- a/mm/util.c +++ b/mm/util.c @@ -627,6 +627,21 @@ void kvfree_sensitive(const void *addr, size_t len) } EXPORT_SYMBOL(kvfree_sensitive);
+void *kvrealloc(const void *p, size_t oldsize, size_t newsize, gfp_t flags) +{ + void *newp; + + if (oldsize >= newsize) + return (void *)p; + newp = kvmalloc(newsize, flags); + if (!newp) + return NULL; + memcpy(newp, p, oldsize); + kvfree(p); + return newp; +} +EXPORT_SYMBOL(kvrealloc); + static inline void *__page_rmapping(struct page *page) { unsigned long mapping;
From: Dave Chinner dchinner@redhat.com
mainline-inclusion from mainline-v5.14-rc4 commit 98fe2c3cef21b784e2efd1d9d891430d95b4f073 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4KIAO CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
-------------------------------------------------
Since commit 59bb47985c1d ("mm, sl[aou]b: guarantee natural alignment for kmalloc(power-of-two)"), the core slab code now guarantees slab alignment in all situations sufficient for IO purposes (i.e. minimum of 512 byte alignment of >= 512 byte sized heap allocations) we no longer need the workaround in the XFS code to provide this guarantee.
Replace the use of kmem_alloc_io() with kmem_alloc() or kmem_alloc_large() appropriately, and remove the kmem_alloc_io() interface altogether.
Signed-off-by: Dave Chinner dchinner@redhat.com Reviewed-by: Darrick J. Wong djwong@kernel.org Signed-off-by: Darrick J. Wong djwong@kernel.org Signed-off-by: Guo Xuenan guoxuenan@huawei.com Reviewed-by: Lihong Kou koulihong@huawei.com Reviewed-by: Zhang Yi yi.zhang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- fs/xfs/kmem.c | 25 ------------------------- fs/xfs/kmem.h | 1 - fs/xfs/xfs_buf.c | 6 ++---- fs/xfs/xfs_buf.h | 6 ------ fs/xfs/xfs_log.c | 3 +-- fs/xfs/xfs_log_recover.c | 4 +--- fs/xfs/xfs_trace.h | 1 - 7 files changed, 4 insertions(+), 42 deletions(-)
diff --git a/fs/xfs/kmem.c b/fs/xfs/kmem.c index e986b95d94c9..3f2979fd2f2b 100644 --- a/fs/xfs/kmem.c +++ b/fs/xfs/kmem.c @@ -56,31 +56,6 @@ __kmem_vmalloc(size_t size, xfs_km_flags_t flags) return ptr; }
-/* - * Same as kmem_alloc_large, except we guarantee the buffer returned is aligned - * to the @align_mask. We only guarantee alignment up to page size, we'll clamp - * alignment at page size if it is larger. vmalloc always returns a PAGE_SIZE - * aligned region. - */ -void * -kmem_alloc_io(size_t size, int align_mask, xfs_km_flags_t flags) -{ - void *ptr; - - trace_kmem_alloc_io(size, flags, _RET_IP_); - - if (WARN_ON_ONCE(align_mask >= PAGE_SIZE)) - align_mask = PAGE_SIZE - 1; - - ptr = kmem_alloc(size, flags | KM_MAYFAIL); - if (ptr) { - if (!((uintptr_t)ptr & align_mask)) - return ptr; - kfree(ptr); - } - return __kmem_vmalloc(size, flags); -} - void * kmem_alloc_large(size_t size, xfs_km_flags_t flags) { diff --git a/fs/xfs/kmem.h b/fs/xfs/kmem.h index 38007117697e..9ff20047f8b8 100644 --- a/fs/xfs/kmem.h +++ b/fs/xfs/kmem.h @@ -57,7 +57,6 @@ kmem_flags_convert(xfs_km_flags_t flags) }
extern void *kmem_alloc(size_t, xfs_km_flags_t); -extern void *kmem_alloc_io(size_t size, int align_mask, xfs_km_flags_t flags); extern void *kmem_alloc_large(size_t size, xfs_km_flags_t); static inline void kmem_free(const void *ptr) { diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 118819030dbb..21d1d3d14e3b 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -361,7 +361,7 @@ xfs_buf_allocate_memory( unsigned short page_count, i; xfs_off_t start, end; int error; - xfs_km_flags_t kmflag_mask = 0; + xfs_km_flags_t kmflag_mask = KM_NOFS;
/* * assure zeroed buffer for non-read cases. @@ -378,9 +378,7 @@ xfs_buf_allocate_memory( */ size = BBTOB(bp->b_length); if (size < PAGE_SIZE) { - int align_mask = xfs_buftarg_dma_alignment(bp->b_target); - bp->b_addr = kmem_alloc_io(size, align_mask, - KM_NOFS | kmflag_mask); + bp->b_addr = kmem_alloc(size, kmflag_mask); if (!bp->b_addr) { /* low memory - use alloc_page loop instead */ goto use_alloc_page; diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index bfd2907e7bc4..6ab044cea75d 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -353,12 +353,6 @@ extern int xfs_setsize_buftarg(xfs_buftarg_t *, unsigned int); #define xfs_getsize_buftarg(buftarg) block_size((buftarg)->bt_bdev) #define xfs_readonly_buftarg(buftarg) bdev_read_only((buftarg)->bt_bdev)
-static inline int -xfs_buftarg_dma_alignment(struct xfs_buftarg *bt) -{ - return queue_dma_alignment(bt->bt_bdev->bd_disk->queue); -} - int xfs_buf_reverify(struct xfs_buf *bp, const struct xfs_buf_ops *ops); bool xfs_verify_magic(struct xfs_buf *bp, __be32 dmagic); bool xfs_verify_magic16(struct xfs_buf *bp, __be16 dmagic); diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 447a8903dd06..0e5645dd007b 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -1372,7 +1372,6 @@ xlog_alloc_log( */ ASSERT(log->l_iclog_size >= 4096); for (i = 0; i < log->l_iclog_bufs; i++) { - int align_mask = xfs_buftarg_dma_alignment(mp->m_logdev_targp); size_t bvec_size = howmany(log->l_iclog_size, PAGE_SIZE) * sizeof(struct bio_vec);
@@ -1384,7 +1383,7 @@ xlog_alloc_log( iclog->ic_prev = prev_iclog; prev_iclog = iclog;
- iclog->ic_data = kmem_alloc_io(log->l_iclog_size, align_mask, + iclog->ic_data = kmem_alloc_large(log->l_iclog_size, KM_MAYFAIL | KM_ZERO); if (!iclog->ic_data) goto out_free_iclog; diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index e61f28ce3e44..e09735871ab6 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -78,8 +78,6 @@ xlog_alloc_buffer( struct xlog *log, int nbblks) { - int align_mask = xfs_buftarg_dma_alignment(log->l_targ); - /* * Pass log block 0 since we don't have an addr yet, buffer will be * verified on read. @@ -107,7 +105,7 @@ xlog_alloc_buffer( if (nbblks > 1 && log->l_sectBBsize > 1) nbblks += log->l_sectBBsize; nbblks = round_up(nbblks, log->l_sectBBsize); - return kmem_alloc_io(BBTOB(nbblks), align_mask, KM_MAYFAIL | KM_ZERO); + return kmem_alloc_large(BBTOB(nbblks), KM_MAYFAIL | KM_ZERO); }
/* diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 86951652d3ed..02e6a0cf548c 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -3675,7 +3675,6 @@ DEFINE_EVENT(xfs_kmem_class, name, \ TP_PROTO(ssize_t size, int flags, unsigned long caller_ip), \ TP_ARGS(size, flags, caller_ip)) DEFINE_KMEM_EVENT(kmem_alloc); -DEFINE_KMEM_EVENT(kmem_alloc_io); DEFINE_KMEM_EVENT(kmem_alloc_large);
TRACE_EVENT(xfs_check_new_dalign,
From: Dave Chinner dchinner@redhat.com
mainline-inclusion from mainline-v5.14-rc4 commit d634525db63e9e946c3229fb93c8d9b763afbaf3 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4KIAO CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
-------------------------------------------------
There is no reason for this wrapper existing anymore. All the places that use KM_NOFS allocation are within transaction contexts and hence covered by memalloc_nofs_save/restore contexts. Hence we don't need any special handling of vmalloc for large IOs anymore and so special casing this code isn't necessary.
Signed-off-by: Dave Chinner dchinner@redhat.com Reviewed-by: Darrick J. Wong djwong@kernel.org Signed-off-by: Darrick J. Wong djwong@kernel.org Signed-off-by: Guo Xuenan guoxuenan@huawei.com Reviewed-by: Lihong Kou koulihong@huawei.com Reviewed-by: Zhang Yi yi.zhang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- fs/xfs/kmem.c | 39 ----------------------------------- fs/xfs/kmem.h | 1 - fs/xfs/libxfs/xfs_attr_leaf.c | 2 +- fs/xfs/scrub/attr.c | 14 +++++++------ fs/xfs/scrub/attr.h | 3 --- fs/xfs/xfs_log.c | 4 ++-- fs/xfs/xfs_log_cil.c | 10 ++++++++- fs/xfs/xfs_log_recover.c | 2 +- fs/xfs/xfs_trace.h | 1 - 9 files changed, 21 insertions(+), 55 deletions(-)
diff --git a/fs/xfs/kmem.c b/fs/xfs/kmem.c index 3f2979fd2f2b..6f49bf39183c 100644 --- a/fs/xfs/kmem.c +++ b/fs/xfs/kmem.c @@ -29,42 +29,3 @@ kmem_alloc(size_t size, xfs_km_flags_t flags) congestion_wait(BLK_RW_ASYNC, HZ/50); } while (1); } - - -/* - * __vmalloc() will allocate data pages and auxiliary structures (e.g. - * pagetables) with GFP_KERNEL, yet we may be under GFP_NOFS context here. Hence - * we need to tell memory reclaim that we are in such a context via - * PF_MEMALLOC_NOFS to prevent memory reclaim re-entering the filesystem here - * and potentially deadlocking. - */ -static void * -__kmem_vmalloc(size_t size, xfs_km_flags_t flags) -{ - unsigned nofs_flag = 0; - void *ptr; - gfp_t lflags = kmem_flags_convert(flags); - - if (flags & KM_NOFS) - nofs_flag = memalloc_nofs_save(); - - ptr = __vmalloc(size, lflags); - - if (flags & KM_NOFS) - memalloc_nofs_restore(nofs_flag); - - return ptr; -} - -void * -kmem_alloc_large(size_t size, xfs_km_flags_t flags) -{ - void *ptr; - - trace_kmem_alloc_large(size, flags, _RET_IP_); - - ptr = kmem_alloc(size, flags | KM_MAYFAIL); - if (ptr) - return ptr; - return __kmem_vmalloc(size, flags); -} diff --git a/fs/xfs/kmem.h b/fs/xfs/kmem.h index 9ff20047f8b8..54da6d717a06 100644 --- a/fs/xfs/kmem.h +++ b/fs/xfs/kmem.h @@ -57,7 +57,6 @@ kmem_flags_convert(xfs_km_flags_t flags) }
extern void *kmem_alloc(size_t, xfs_km_flags_t); -extern void *kmem_alloc_large(size_t size, xfs_km_flags_t); static inline void kmem_free(const void *ptr) { kvfree(ptr); diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c index d6ef69ab1c67..d043527d8409 100644 --- a/fs/xfs/libxfs/xfs_attr_leaf.c +++ b/fs/xfs/libxfs/xfs_attr_leaf.c @@ -488,7 +488,7 @@ xfs_attr_copy_value( }
if (!args->value) { - args->value = kmem_alloc_large(valuelen, KM_NOLOCKDEP); + args->value = kvmalloc(valuelen, GFP_KERNEL | __GFP_NOLOCKDEP); if (!args->value) return -ENOMEM; } diff --git a/fs/xfs/scrub/attr.c b/fs/xfs/scrub/attr.c index 9faddb334a2c..b5c071258fdf 100644 --- a/fs/xfs/scrub/attr.c +++ b/fs/xfs/scrub/attr.c @@ -25,11 +25,11 @@ * reallocating the buffer if necessary. Buffer contents are not preserved * across a reallocation. */ -int +static int xchk_setup_xattr_buf( struct xfs_scrub *sc, size_t value_size, - xfs_km_flags_t flags) + gfp_t flags) { size_t sz; struct xchk_xattr_buf *ab = sc->buf; @@ -57,7 +57,7 @@ xchk_setup_xattr_buf( * Don't zero the buffer upon allocation to avoid runtime overhead. * All users must be careful never to read uninitialized contents. */ - ab = kmem_alloc_large(sizeof(*ab) + sz, flags); + ab = kvmalloc(sizeof(*ab) + sz, flags); if (!ab) return -ENOMEM;
@@ -80,7 +80,7 @@ xchk_setup_xattr( * without the inode lock held, which means we can sleep. */ if (sc->flags & XCHK_TRY_HARDER) { - error = xchk_setup_xattr_buf(sc, XATTR_SIZE_MAX, 0); + error = xchk_setup_xattr_buf(sc, XATTR_SIZE_MAX, GFP_KERNEL); if (error) return error; } @@ -139,7 +139,8 @@ xchk_xattr_listent( * doesn't work, we overload the seen_enough variable to convey * the error message back to the main scrub function. */ - error = xchk_setup_xattr_buf(sx->sc, valuelen, KM_MAYFAIL); + error = xchk_setup_xattr_buf(sx->sc, valuelen, + GFP_KERNEL | __GFP_RETRY_MAYFAIL); if (error == -ENOMEM) error = -EDEADLOCK; if (error) { @@ -324,7 +325,8 @@ xchk_xattr_block( return 0;
/* Allocate memory for block usage checking. */ - error = xchk_setup_xattr_buf(ds->sc, 0, KM_MAYFAIL); + error = xchk_setup_xattr_buf(ds->sc, 0, + GFP_KERNEL | __GFP_RETRY_MAYFAIL); if (error == -ENOMEM) return -EDEADLOCK; if (error) diff --git a/fs/xfs/scrub/attr.h b/fs/xfs/scrub/attr.h index 13a1d2e8424d..1719e1c4da59 100644 --- a/fs/xfs/scrub/attr.h +++ b/fs/xfs/scrub/attr.h @@ -65,7 +65,4 @@ xchk_xattr_dstmap( BITS_TO_LONGS(sc->mp->m_attr_geo->blksize); }
-int xchk_setup_xattr_buf(struct xfs_scrub *sc, size_t value_size, - xfs_km_flags_t flags); - #endif /* __XFS_SCRUB_ATTR_H__ */ diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 0e5645dd007b..200b92fae5bb 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -1383,8 +1383,8 @@ xlog_alloc_log( iclog->ic_prev = prev_iclog; prev_iclog = iclog;
- iclog->ic_data = kmem_alloc_large(log->l_iclog_size, - KM_MAYFAIL | KM_ZERO); + iclog->ic_data = kvzalloc(log->l_iclog_size, + GFP_KERNEL | __GFP_RETRY_MAYFAIL); if (!iclog->ic_data) goto out_free_iclog; #ifdef DEBUG diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c index 4c44bc3786c0..4e41130f206f 100644 --- a/fs/xfs/xfs_log_cil.c +++ b/fs/xfs/xfs_log_cil.c @@ -185,7 +185,15 @@ xlog_cil_alloc_shadow_bufs( */ kmem_free(lip->li_lv_shadow);
- lv = kmem_alloc_large(buf_size, KM_NOFS); + /* + * We are in transaction context, which means this + * allocation will pick up GFP_NOFS from the + * memalloc_nofs_save/restore context the transaction + * holds. This means we can use GFP_KERNEL here so the + * generic kvmalloc() code will run vmalloc on + * contiguous page allocation failure as we require. + */ + lv = kvmalloc(buf_size, GFP_KERNEL); memset(lv, 0, xlog_cil_iovec_space(niovecs));
lv->lv_item = lip; diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index e09735871ab6..d5a8bea48be2 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -105,7 +105,7 @@ xlog_alloc_buffer( if (nbblks > 1 && log->l_sectBBsize > 1) nbblks += log->l_sectBBsize; nbblks = round_up(nbblks, log->l_sectBBsize); - return kmem_alloc_large(BBTOB(nbblks), KM_MAYFAIL | KM_ZERO); + return kvzalloc(BBTOB(nbblks), GFP_KERNEL | __GFP_RETRY_MAYFAIL); }
/* diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 02e6a0cf548c..54a7b3c63d7b 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -3675,7 +3675,6 @@ DEFINE_EVENT(xfs_kmem_class, name, \ TP_PROTO(ssize_t size, int flags, unsigned long caller_ip), \ TP_ARGS(size, flags, caller_ip)) DEFINE_KMEM_EVENT(kmem_alloc); -DEFINE_KMEM_EVENT(kmem_alloc_large);
TRACE_EVENT(xfs_check_new_dalign, TP_PROTO(struct xfs_mount *mp, int new_dalign, xfs_ino_t calc_rootino),
From: "Darrick J. Wong" djwong@kernel.org
mainline-inclusion from mainline-v5.14-rc4 commit c02f6529864a4f5f91d216d324bac4ba75415d19 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4KIAO CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
-------------------------------------------------
In commit 8ad560d2565e, we changed xfs_rtalloc_query_range to constrain the range of bits in the realtime bitmap file that would actually be searched. In commit a3a374bf1889, we changed the range again (incorrectly), leading to the fix in commit d88850bd5516, which finally corrected the range check code. Unfortunately, the author never noticed that the function modifies its input parameters, which is a totaly no-no since none of the other range query functions change their input parameters.
So, fix this function yet again to stash the upper end of the query range (i.e. the high key) in a local variable and hope this is the last time I have to fix my own function. While we're at it, mark the key inputs const so nobody makes this mistake again. :(
Fixes: 8ad560d2565e ("xfs: strengthen rtalloc query range checks") Not-fixed-by: a3a374bf1889 ("xfs: fix off-by-one error in xfs_rtalloc_query_range") Not-fixed-by: d88850bd5516 ("xfs: fix high key handling in the rt allocator's query_range function") Signed-off-by: Darrick J. Wong djwong@kernel.org Reviewed-by: Christoph Hellwig hch@lst.de Reviewed-by: Chandan Babu R chandanrlinux@gmail.com Signed-off-by: Guo Xuenan guoxuenan@huawei.com Reviewed-by: Lihong Kou koulihong@huawei.com Reviewed-by: Zhang Yi yi.zhang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- fs/xfs/libxfs/xfs_rtbitmap.c | 14 +++++++------- fs/xfs/xfs_rtalloc.h | 7 +++---- 2 files changed, 10 insertions(+), 11 deletions(-)
diff --git a/fs/xfs/libxfs/xfs_rtbitmap.c b/fs/xfs/libxfs/xfs_rtbitmap.c index 6c1aba16113c..2bfc66a162c2 100644 --- a/fs/xfs/libxfs/xfs_rtbitmap.c +++ b/fs/xfs/libxfs/xfs_rtbitmap.c @@ -1009,8 +1009,8 @@ xfs_rtfree_extent( int xfs_rtalloc_query_range( struct xfs_trans *tp, - struct xfs_rtalloc_rec *low_rec, - struct xfs_rtalloc_rec *high_rec, + const struct xfs_rtalloc_rec *low_rec, + const struct xfs_rtalloc_rec *high_rec, xfs_rtalloc_query_range_fn fn, void *priv) { @@ -1018,6 +1018,7 @@ xfs_rtalloc_query_range( struct xfs_mount *mp = tp->t_mountp; xfs_rtblock_t rtstart; xfs_rtblock_t rtend; + xfs_rtblock_t high_key; int is_free; int error = 0;
@@ -1026,12 +1027,12 @@ xfs_rtalloc_query_range( if (low_rec->ar_startext >= mp->m_sb.sb_rextents || low_rec->ar_startext == high_rec->ar_startext) return 0; - high_rec->ar_startext = min(high_rec->ar_startext, - mp->m_sb.sb_rextents - 1); + + high_key = min(high_rec->ar_startext, mp->m_sb.sb_rextents - 1);
/* Iterate the bitmap, looking for discrepancies. */ rtstart = low_rec->ar_startext; - while (rtstart <= high_rec->ar_startext) { + while (rtstart <= high_key) { /* Is the first block free? */ error = xfs_rtcheck_range(mp, tp, rtstart, 1, 1, &rtend, &is_free); @@ -1039,8 +1040,7 @@ xfs_rtalloc_query_range( break;
/* How long does the extent go for? */ - error = xfs_rtfind_forw(mp, tp, rtstart, - high_rec->ar_startext, &rtend); + error = xfs_rtfind_forw(mp, tp, rtstart, high_key, &rtend); if (error) break;
diff --git a/fs/xfs/xfs_rtalloc.h b/fs/xfs/xfs_rtalloc.h index 93e77b221355..05f92aea34fa 100644 --- a/fs/xfs/xfs_rtalloc.h +++ b/fs/xfs/xfs_rtalloc.h @@ -124,10 +124,9 @@ int xfs_rtfree_range(struct xfs_mount *mp, struct xfs_trans *tp, xfs_rtblock_t start, xfs_extlen_t len, struct xfs_buf **rbpp, xfs_fsblock_t *rsb); int xfs_rtalloc_query_range(struct xfs_trans *tp, - struct xfs_rtalloc_rec *low_rec, - struct xfs_rtalloc_rec *high_rec, - xfs_rtalloc_query_range_fn fn, - void *priv); + const struct xfs_rtalloc_rec *low_rec, + const struct xfs_rtalloc_rec *high_rec, + xfs_rtalloc_query_range_fn fn, void *priv); int xfs_rtalloc_query_all(struct xfs_trans *tp, xfs_rtalloc_query_range_fn fn, void *priv);
From: "Darrick J. Wong" djwong@kernel.org
mainline-inclusion from mainline-v5.14-rc4 commit 9ab72f22277487d2a3ffc8dd20fc918e186bb2b3 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4KIAO CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
-------------------------------------------------
The fsmap implementation for realtime devices uses the gap between info->next_daddr and a free rtextent reported by xfs_rtalloc_query_range to feed userspace fsmap records with an "unknown" owner. We use this trick to report to userspace when the last rtextent in the filesystem is in use by synthesizing a null rmap record starting at the next block after the query range.
Unfortunately, there's a minor accounting bug in the way that we construct the null rmap record. Originally, ahigh.ar_startext contains the last rtextent for which the user wants records. It's entirely possible that number is beyond the end of the rt volume, so the location synthesized rmap record /must/ be constrained to the minimum of the high key and the number of extents in the rt volume.
Signed-off-by: Darrick J. Wong djwong@kernel.org Reviewed-by: Christoph Hellwig hch@lst.de Reviewed-by: Chandan Babu R chandanrlinux@gmail.com Signed-off-by: Guo Xuenan guoxuenan@huawei.com Reviewed-by: Lihong Kou koulihong@huawei.com Reviewed-by: Zhang Yi yi.zhang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- fs/xfs/xfs_fsmap.c | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-)
diff --git a/fs/xfs/xfs_fsmap.c b/fs/xfs/xfs_fsmap.c index 9ce5e7d5bf8f..a7c42b25ba56 100644 --- a/fs/xfs/xfs_fsmap.c +++ b/fs/xfs/xfs_fsmap.c @@ -522,27 +522,37 @@ xfs_getfsmap_rtdev_rtbitmap_query( { struct xfs_rtalloc_rec alow = { 0 }; struct xfs_rtalloc_rec ahigh = { 0 }; + struct xfs_mount *mp = tp->t_mountp; int error;
- xfs_ilock(tp->t_mountp->m_rbmip, XFS_ILOCK_SHARED); + xfs_ilock(mp->m_rbmip, XFS_ILOCK_SHARED);
+ /* + * Set up query parameters to return free rtextents covering the range + * we want. + */ alow.ar_startext = info->low.rm_startblock; ahigh.ar_startext = info->high.rm_startblock; - do_div(alow.ar_startext, tp->t_mountp->m_sb.sb_rextsize); - if (do_div(ahigh.ar_startext, tp->t_mountp->m_sb.sb_rextsize)) + do_div(alow.ar_startext, mp->m_sb.sb_rextsize); + if (do_div(ahigh.ar_startext, mp->m_sb.sb_rextsize)) ahigh.ar_startext++; error = xfs_rtalloc_query_range(tp, &alow, &ahigh, xfs_getfsmap_rtdev_rtbitmap_helper, info); if (error) goto err;
- /* Report any gaps at the end of the rtbitmap */ + /* + * Report any gaps at the end of the rtbitmap by simulating a null + * rmap starting at the block after the end of the query range. + */ info->last = true; + ahigh.ar_startext = min(mp->m_sb.sb_rextents, ahigh.ar_startext); + error = xfs_getfsmap_rtdev_rtbitmap_helper(tp, &ahigh, info); if (error) goto err; err: - xfs_iunlock(tp->t_mountp->m_rbmip, XFS_ILOCK_SHARED); + xfs_iunlock(mp->m_rbmip, XFS_ILOCK_SHARED); return error; }
From: "Darrick J. Wong" djwong@kernel.org
mainline-inclusion from mainline-v5.14-rc4 commit 7e1826e05ba62e85fe110939a181c8f0d58c14cf category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4KIAO CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
-------------------------------------------------
There are several GETFSMAP backend functions for XFS to cover the three devices and various feature support. Each of these functions are passed pointers to the low and high keys for the dataset that userspace requested, and a pointer to scratchpad variables that are used to control the iteration and fill out records. The scratchpad data can be changed arbitrarily, but the keys are supposed to remain unchanged (and under the control of the outermost loop in xfs_getfsmap).
Unfortunately, the data and rt backends modify the keys that are passed in from the main control loop, which causes subsequent calls to return incorrect query results. Specifically, each of those two functions set the block number in the high key to the size of their respective device. Since fsmap results are sorted in device number order, if the lower numbered device is smaller than the higher numbered device, the first function will set the high key to the small size, and the key remains unchanged as it is passed into the function for the higher numbered device. The second function will then fail to return all of the results for the dataset that userspace is asking for because the keyspace is incorrectly constrained.
Signed-off-by: Darrick J. Wong djwong@kernel.org Reviewed-by: Christoph Hellwig hch@lst.de Reviewed-by: Chandan Babu R chandanrlinux@gmail.com Signed-off-by: Guo Xuenan guoxuenan@huawei.com Reviewed-by: Lihong Kou koulihong@huawei.com Reviewed-by: Zhang Yi yi.zhang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- fs/xfs/xfs_fsmap.c | 30 +++++++++++++----------------- 1 file changed, 13 insertions(+), 17 deletions(-)
diff --git a/fs/xfs/xfs_fsmap.c b/fs/xfs/xfs_fsmap.c index a7c42b25ba56..2d98d8cfae44 100644 --- a/fs/xfs/xfs_fsmap.c +++ b/fs/xfs/xfs_fsmap.c @@ -60,7 +60,7 @@ xfs_fsmap_to_internal( static int xfs_fsmap_owner_to_rmap( struct xfs_rmap_irec *dest, - struct xfs_fsmap *src) + const struct xfs_fsmap *src) { if (!(src->fmr_flags & FMR_OF_SPECIAL_OWNER)) { dest->rm_owner = src->fmr_owner; @@ -170,7 +170,7 @@ struct xfs_getfsmap_info { struct xfs_getfsmap_dev { u32 dev; int (*fn)(struct xfs_trans *tp, - struct xfs_fsmap *keys, + const struct xfs_fsmap *keys, struct xfs_getfsmap_info *info); };
@@ -388,7 +388,7 @@ xfs_getfsmap_datadev_bnobt_helper( static void xfs_getfsmap_set_irec_flags( struct xfs_rmap_irec *irec, - struct xfs_fsmap *fmr) + const struct xfs_fsmap *fmr) { irec->rm_flags = 0; if (fmr->fmr_flags & FMR_OF_ATTR_FORK) @@ -403,7 +403,7 @@ xfs_getfsmap_set_irec_flags( STATIC int xfs_getfsmap_logdev( struct xfs_trans *tp, - struct xfs_fsmap *keys, + const struct xfs_fsmap *keys, struct xfs_getfsmap_info *info) { struct xfs_mount *mp = tp->t_mountp; @@ -472,7 +472,7 @@ xfs_getfsmap_rtdev_rtbitmap_helper( STATIC int __xfs_getfsmap_rtdev( struct xfs_trans *tp, - struct xfs_fsmap *keys, + const struct xfs_fsmap *keys, int (*query_fn)(struct xfs_trans *, struct xfs_getfsmap_info *), struct xfs_getfsmap_info *info) @@ -480,16 +480,14 @@ __xfs_getfsmap_rtdev( struct xfs_mount *mp = tp->t_mountp; xfs_fsblock_t start_fsb; xfs_fsblock_t end_fsb; - xfs_daddr_t eofs; + uint64_t eofs; int error = 0;
eofs = XFS_FSB_TO_BB(mp, mp->m_sb.sb_rblocks); if (keys[0].fmr_physical >= eofs) return 0; - if (keys[1].fmr_physical >= eofs) - keys[1].fmr_physical = eofs - 1; start_fsb = XFS_BB_TO_FSBT(mp, keys[0].fmr_physical); - end_fsb = XFS_BB_TO_FSB(mp, keys[1].fmr_physical); + end_fsb = XFS_BB_TO_FSB(mp, min(eofs - 1, keys[1].fmr_physical));
/* Set up search keys */ info->low.rm_startblock = start_fsb; @@ -560,7 +558,7 @@ xfs_getfsmap_rtdev_rtbitmap_query( STATIC int xfs_getfsmap_rtdev_rtbitmap( struct xfs_trans *tp, - struct xfs_fsmap *keys, + const struct xfs_fsmap *keys, struct xfs_getfsmap_info *info) { info->missing_owner = XFS_FMR_OWN_UNKNOWN; @@ -573,7 +571,7 @@ xfs_getfsmap_rtdev_rtbitmap( STATIC int __xfs_getfsmap_datadev( struct xfs_trans *tp, - struct xfs_fsmap *keys, + const struct xfs_fsmap *keys, struct xfs_getfsmap_info *info, int (*query_fn)(struct xfs_trans *, struct xfs_getfsmap_info *, @@ -587,16 +585,14 @@ __xfs_getfsmap_datadev( xfs_fsblock_t end_fsb; xfs_agnumber_t start_ag; xfs_agnumber_t end_ag; - xfs_daddr_t eofs; + uint64_t eofs; int error = 0;
eofs = XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks); if (keys[0].fmr_physical >= eofs) return 0; - if (keys[1].fmr_physical >= eofs) - keys[1].fmr_physical = eofs - 1; start_fsb = XFS_DADDR_TO_FSB(mp, keys[0].fmr_physical); - end_fsb = XFS_DADDR_TO_FSB(mp, keys[1].fmr_physical); + end_fsb = XFS_DADDR_TO_FSB(mp, min(eofs - 1, keys[1].fmr_physical));
/* * Convert the fsmap low/high keys to AG based keys. Initialize @@ -710,7 +706,7 @@ xfs_getfsmap_datadev_rmapbt_query( STATIC int xfs_getfsmap_datadev_rmapbt( struct xfs_trans *tp, - struct xfs_fsmap *keys, + const struct xfs_fsmap *keys, struct xfs_getfsmap_info *info) { info->missing_owner = XFS_FMR_OWN_FREE; @@ -745,7 +741,7 @@ xfs_getfsmap_datadev_bnobt_query( STATIC int xfs_getfsmap_datadev_bnobt( struct xfs_trans *tp, - struct xfs_fsmap *keys, + const struct xfs_fsmap *keys, struct xfs_getfsmap_info *info) { struct xfs_alloc_rec_incore akeys[2];
From: "Darrick J. Wong" djwong@kernel.org
mainline-inclusion from mainline-v5.14-rc4 commit 61e0d0cc51cd6b9d7447923f3ac7e60049de3e2e category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4KIAO CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
-------------------------------------------------
The kernel test robot found the following bug when running xfs/355 to scrub a bmap btree:
XFS: Assertion failed: !sa->pag, file: fs/xfs/scrub/common.c, line: 412 ------------[ cut here ]------------ kernel BUG at fs/xfs/xfs_message.c:110! invalid opcode: 0000 [#1] SMP PTI CPU: 2 PID: 1415 Comm: xfs_scrub Not tainted 5.14.0-rc4-00021-g48c6615cc557 #1 Hardware name: Hewlett-Packard p6-1451cx/2ADA, BIOS 8.15 02/05/2013 RIP: 0010:assfail+0x23/0x28 [xfs] RSP: 0018:ffffc9000aacb890 EFLAGS: 00010202 RAX: 0000000000000000 RBX: ffffc9000aacbcc8 RCX: 0000000000000000 RDX: 00000000ffffffc0 RSI: 000000000000000a RDI: ffffffffc09e7dcd RBP: ffffc9000aacbc80 R08: ffff8881fdf17d50 R09: 0000000000000000 R10: 000000000000000a R11: f000000000000000 R12: 0000000000000000 R13: ffff88820c7ed000 R14: 0000000000000001 R15: ffffc9000aacb980 FS: 00007f185b955700(0000) GS:ffff8881fdf00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007f7f6ef43000 CR3: 000000020de38002 CR4: 00000000001706e0 Call Trace: xchk_ag_read_headers+0xda/0x100 [xfs] xchk_ag_init+0x15/0x40 [xfs] xchk_btree_check_block_owner+0x76/0x180 [xfs] xchk_btree_get_block+0xd0/0x140 [xfs] xchk_btree+0x32e/0x440 [xfs] xchk_bmap_btree+0xd4/0x140 [xfs] xchk_bmap+0x1eb/0x3c0 [xfs] xfs_scrub_metadata+0x227/0x4c0 [xfs] xfs_ioc_scrub_metadata+0x50/0xc0 [xfs] xfs_file_ioctl+0x90c/0xc40 [xfs] __x64_sys_ioctl+0x83/0xc0 do_syscall_64+0x3b/0xc0
The unusual handling of errors while initializing struct xchk_ag is the root cause here. Since the beginning of xfs_scrub, the goal of xchk_ag_read_headers has been to read all three AG header buffers and attach them both to the xchk_ag structure and the scrub transaction. Corruption errors on any of the three headers doesn't necessarily trigger an immediate return to userspace, because xfs_scrub can also tell us to /fix/ the problem.
In other words, it's possible for the xchk_ag init functions to return an error code and a partially filled out structure so that scrub can use however much information it managed to pull. Before 5.15, it was sufficient to cancel (or commit) the scrub transaction on the way out of the scrub code to release the buffers.
Ccommit 48c6615cc557 added a reference to the perag structure to struct xchk_ag. Since perag structures are not attached to transactions like buffers are, this adds the requirement that the perag ref be released explicitly. The scrub teardown function xchk_teardown was amended to do this for the xchk_ag embedded in struct xfs_scrub.
Unfortunately, I forgot that certain parts of the scrub code probe multiple AGs and therefore handle the initialization and cleanup on their own. Specifically, the bmbt scrubber will initialize it long enough to cross-reference AG metadata for btree blocks and for the extent mappings in the bmbt.
If one of the AG headers is corrupt, the init function returns with a live perag structure reference and some of the AG header buffers. If an error occurs, the cross referencing will be noted as XCORRUPTion and skipped, but the main scrub process will move on to the next record. It is now necessary to release the perag reference before we try to analyze something from a different AG, or else we'll trip over the assertion noted above.
Fixes: 48c6615cc557 ("xfs: grab active perag ref when reading AG headers") Reported-by: kernel test robot oliver.sang@intel.com Signed-off-by: Darrick J. Wong djwong@kernel.org Reviewed-by: Chandan Babu R chandanrlinux@gmail.com Signed-off-by: Guo Xuenan guoxuenan@huawei.com Reviewed-by: Lihong Kou koulihong@huawei.com Reviewed-by: Zhang Yi yi.zhang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- fs/xfs/scrub/bmap.c | 3 ++- fs/xfs/scrub/btree.c | 3 ++- fs/xfs/scrub/fscounters.c | 2 +- fs/xfs/scrub/inode.c | 3 ++- 4 files changed, 7 insertions(+), 4 deletions(-)
diff --git a/fs/xfs/scrub/bmap.c b/fs/xfs/scrub/bmap.c index e68d5d6a6e6b..694af54e83c5 100644 --- a/fs/xfs/scrub/bmap.c +++ b/fs/xfs/scrub/bmap.c @@ -263,7 +263,7 @@ xchk_bmap_iextent_xref( error = xchk_ag_init(info->sc, agno, &info->sc->sa); if (!xchk_fblock_process_error(info->sc, info->whichfork, irec->br_startoff, &error)) - return; + goto out_free;
xchk_xref_is_used_space(info->sc, agbno, len); xchk_xref_is_not_inode_chunk(info->sc, agbno, len); @@ -283,6 +283,7 @@ xchk_bmap_iextent_xref( break; }
+out_free: xchk_ag_free(info->sc, &info->sc->sa); }
diff --git a/fs/xfs/scrub/btree.c b/fs/xfs/scrub/btree.c index 8c81761e52ab..043984245a6d 100644 --- a/fs/xfs/scrub/btree.c +++ b/fs/xfs/scrub/btree.c @@ -376,7 +376,7 @@ xchk_btree_check_block_owner( error = xchk_ag_init(bs->sc, agno, &bs->sc->sa); if (!xchk_btree_xref_process_error(bs->sc, bs->cur, level, &error)) - return error; + goto out_free; }
xchk_xref_is_used_space(bs->sc, agbno, 1); @@ -392,6 +392,7 @@ xchk_btree_check_block_owner( if (!bs->sc->sa.rmap_cur && btnum == XFS_BTNUM_RMAP) bs->cur = NULL;
+out_free: if (init_sa) xchk_ag_free(bs->sc, &bs->sc->sa);
diff --git a/fs/xfs/scrub/fscounters.c b/fs/xfs/scrub/fscounters.c index e6389bcf20cf..812530b92452 100644 --- a/fs/xfs/scrub/fscounters.c +++ b/fs/xfs/scrub/fscounters.c @@ -157,7 +157,7 @@ xchk_fscount_btreeblks(
error = xchk_ag_init(sc, agno, &sc->sa); if (error) - return error; + goto out_free;
error = xfs_btree_count_blocks(sc->sa.bno_cur, &blocks); if (error) diff --git a/fs/xfs/scrub/inode.c b/fs/xfs/scrub/inode.c index bb25ff1b770d..147c443a7242 100644 --- a/fs/xfs/scrub/inode.c +++ b/fs/xfs/scrub/inode.c @@ -525,7 +525,7 @@ xchk_inode_xref(
error = xchk_ag_init(sc, agno, &sc->sa); if (!xchk_xref_process_error(sc, agno, agbno, &error)) - return; + goto out_free;
xchk_xref_is_used_space(sc, agbno, 1); xchk_inode_xref_finobt(sc, ino); @@ -533,6 +533,7 @@ xchk_inode_xref( xchk_xref_is_not_shared(sc, agbno, 1); xchk_inode_xref_bmap(sc, dip);
+out_free: xchk_ag_free(sc, &sc->sa); }
From: "Darrick J. Wong" djwong@kernel.org
mainline-inclusion from mainline-v5.14-rc4 commit 72a048c1056a72e37ea2ee34cc73d8c6d6cb4290 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4KIAO CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
-------------------------------------------------
While prototyping a free space defragmentation tool, I observed an unexpected IO error while running a sequence of commands that can be recreated by the following sequence of commands:
fallocate: Input/output error
I then scraped this (abbreviated) stack trace from dmesg:
WARNING: CPU: 0 PID: 30788 at fs/iomap/buffered-io.c:577 iomap_write_begin+0x376/0x450 CPU: 0 PID: 30788 Comm: xfs_io Not tainted 5.14.0-rc6-xfsx #rc6 5ef57b62a900814b3e4d885c755e9014541c8732 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.13.0-1ubuntu1.1 04/01/2014 RIP: 0010:iomap_write_begin+0x376/0x450 RSP: 0018:ffffc90000c0fc20 EFLAGS: 00010297 RAX: 0000000000000001 RBX: ffffc90000c0fd10 RCX: 0000000000001000 RDX: ffffc90000c0fc54 RSI: 000000000000000c RDI: 000000000000000c RBP: ffff888005d5dbd8 R08: 0000000000102000 R09: ffffc90000c0fc50 R10: 0000000000b00000 R11: 0000000000101000 R12: ffffea0000336c40 R13: 0000000000001000 R14: ffffc90000c0fd10 R15: 0000000000101000 FS: 00007f4b8f62fe40(0000) GS:ffff88803ec00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 000056361c554108 CR3: 000000000524e004 CR4: 00000000001706f0 Call Trace: iomap_unshare_actor+0x95/0x140 iomap_apply+0xfa/0x300 iomap_file_unshare+0x44/0x60 xfs_reflink_unshare+0x50/0x140 [xfs 61947ea9b3a73e79d747dbc1b90205e7987e4195] xfs_file_fallocate+0x27c/0x610 [xfs 61947ea9b3a73e79d747dbc1b90205e7987e4195] vfs_fallocate+0x133/0x330 __x64_sys_fallocate+0x3e/0x70 do_syscall_64+0x35/0x80 entry_SYSCALL_64_after_hwframe+0x44/0xae RIP: 0033:0x7f4b8f79140a
Looking at the iomap tracepoints, I saw this:
iomap_iter: dev 8:64 ino 0x100 pos 0 length 0 flags WRITE|0x80 (0x81) ops xfs_buffered_write_iomap_ops caller iomap_file_unshare iomap_iter_dstmap: dev 8:64 ino 0x100 bdev 8:64 addr -1 offset 0 length 131072 type DELALLOC flags SHARED iomap_iter_srcmap: dev 8:64 ino 0x100 bdev 8:64 addr 147456 offset 0 length 4096 type MAPPED flags iomap_iter: dev 8:64 ino 0x100 pos 0 length 4096 flags WRITE|0x80 (0x81) ops xfs_buffered_write_iomap_ops caller iomap_file_unshare iomap_iter_dstmap: dev 8:64 ino 0x100 bdev 8:64 addr -1 offset 4096 length 4096 type DELALLOC flags SHARED console: WARNING: CPU: 0 PID: 30788 at fs/iomap/buffered-io.c:577 iomap_write_begin+0x376/0x450
The first time funshare calls ->iomap_begin, xfs sees that the first block is shared and creates a 128k delalloc reservation in the COW fork. The delalloc reservation is returned as dstmap, and the shared block is returned as srcmap. So far so good.
funshare calls ->iomap_begin to try the second block. This time there's no srcmap (punch-alternating punched it out!) but we still have the delalloc reservation in the COW fork. Therefore, we again return the reservation as dstmap and the hole as srcmap. iomap_unshare_iter incorrectly tries to unshare the hole, which __iomap_write_begin rejects because shared regions must be fully written and therefore cannot require zeroing.
Therefore, change the buffered write iomap_begin function not to set IOMAP_F_SHARED when there isn't a source mapping to read from for the unsharing.
Signed-off-by: Darrick J. Wong djwong@kernel.org Reviewed-by: Christoph Hellwig hch@lst.de Reviewed-by: Chandan Babu R chandanrlinux@gmail.com Signed-off-by: Guo Xuenan guoxuenan@huawei.com Reviewed-by: Lihong Kou koulihong@huawei.com Reviewed-by: Zhang Yi yi.zhang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- fs/xfs/xfs_iomap.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-)
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 9d701ce2f703..37bb6f20dfcf 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -1059,11 +1059,11 @@ xfs_buffered_write_iomap_begin( error = xfs_bmbt_to_iomap(ip, srcmap, &imap, 0); if (error) return error; - } else { - xfs_trim_extent(&cmap, offset_fsb, - imap.br_startoff - offset_fsb); + return xfs_bmbt_to_iomap(ip, iomap, &cmap, IOMAP_F_SHARED); } - return xfs_bmbt_to_iomap(ip, iomap, &cmap, IOMAP_F_SHARED); + + xfs_trim_extent(&cmap, offset_fsb, imap.br_startoff - offset_fsb); + return xfs_bmbt_to_iomap(ip, iomap, &cmap, 0);
out_unlock: xfs_iunlock(ip, XFS_ILOCK_EXCL);
From: Dave Chinner dchinner@redhat.com
mainline-inclusion from mainline-v5.14-rc4 commit f38a032b165d812b0ba8378a5cd237c0888ff65f category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4KIAO CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
-------------------------------------------------
Yup, the VFS hoist broke it, and nobody noticed. Bulkstat workloads make it clear that it doesn't work as it should.
Fixes: dae2f8ed7992 ("fs: Lift XFS_IDONTCACHE to the VFS layer") Signed-off-by: Dave Chinner dchinner@redhat.com Reviewed-by: Darrick J. Wong djwong@kernel.org Signed-off-by: Darrick J. Wong djwong@kernel.org Signed-off-by: Guo Xuenan guoxuenan@huawei.com Reviewed-by: Lihong Kou koulihong@huawei.com Reviewed-by: Zhang Yi yi.zhang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- fs/xfs/xfs_icache.c | 3 ++- fs/xfs/xfs_iops.c | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-)
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index ed1be02dc1c3..39af75dd2bf6 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -47,8 +47,9 @@ xfs_inode_alloc( return NULL; }
- /* VFS doesn't initialise i_mode! */ + /* VFS doesn't initialise i_mode or i_state! */ VFS_I(ip)->i_mode = 0; + VFS_I(ip)->i_state = 0;
XFS_STATS_INC(mp, vn_active); ASSERT(atomic_read(&ip->i_pincount) == 0); diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index b7f7b31a77d5..6a3026e78a9b 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -1328,7 +1328,7 @@ xfs_setup_inode( gfp_t gfp_mask;
inode->i_ino = ip->i_ino; - inode->i_state = I_NEW; + inode->i_state |= I_NEW;
inode_sb_list_add(inode); /* make the inode look hashed for the writeback code */
From: Rustam Kovhaev rkovhaev@gmail.com
mainline-inclusion from mainline-v5.15-rc4 commit c30a0cbd07ecc0eec7b3cd568f7b1c7bb7913f93 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4KIAO CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
-------------------------------------------------
For kmalloc() allocations SLOB prepends the blocks with a 4-byte header, and it puts the size of the allocated blocks in that header. Blocks allocated with kmem_cache_alloc() allocations do not have that header.
SLOB explodes when you allocate memory with kmem_cache_alloc() and then try to free it with kfree() instead of kmem_cache_free(). SLOB will assume that there is a header when there is none, read some garbage to size variable and corrupt the adjacent objects, which eventually leads to hang or panic.
Let's make XFS work with SLOB by using proper free function.
Fixes: 9749fee83f38 ("xfs: enable the xfs_defer mechanism to process extents to free") Signed-off-by: Rustam Kovhaev rkovhaev@gmail.com Reviewed-by: Darrick J. Wong djwong@kernel.org Signed-off-by: Darrick J. Wong djwong@kernel.org Signed-off-by: Guo Xuenan guoxuenan@huawei.com Reviewed-by: Lihong Kou koulihong@huawei.com Reviewed-by: Zhang Yi yi.zhang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- fs/xfs/xfs_extfree_item.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c index 5c0395256bd1..11474770d630 100644 --- a/fs/xfs/xfs_extfree_item.c +++ b/fs/xfs/xfs_extfree_item.c @@ -482,7 +482,7 @@ xfs_extent_free_finish_item( free->xefi_startblock, free->xefi_blockcount, &free->xefi_oinfo, free->xefi_skip_discard); - kmem_free(free); + kmem_cache_free(xfs_bmap_free_item_zone, free); return error; }
@@ -502,7 +502,7 @@ xfs_extent_free_cancel_item( struct xfs_extent_free_item *free;
free = container_of(item, struct xfs_extent_free_item, xefi_list); - kmem_free(free); + kmem_cache_free(xfs_bmap_free_item_zone, free); }
const struct xfs_defer_op_type xfs_extent_free_defer_type = { @@ -564,7 +564,7 @@ xfs_agfl_free_finish_item( extp->ext_len = free->xefi_blockcount; efdp->efd_next_extent++;
- kmem_free(free); + kmem_cache_free(xfs_bmap_free_item_zone, free); return error; }
From: Brian Foster bfoster@redhat.com
mainline-inclusion from mainline-v5.15-rc4 commit 5ca5916b6bc93577c360c06cb7cdf71adb9b5faf category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4KIAO CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
-------------------------------------------------
If writeback I/O to a COW extent fails, the COW fork blocks are punched out and the data fork blocks left alone. It is possible for COW fork blocks to overlap non-shared data fork blocks (due to cowextsz hint prealloc), however, and writeback unconditionally maps to the COW fork whenever blocks exist at the corresponding offset of the page undergoing writeback. This means it's quite possible for a COW fork extent to overlap delalloc data fork blocks, writeback to convert and map to the COW fork blocks, writeback to fail, and finally for ioend completion to cancel the COW fork blocks and leave stale data fork delalloc blocks around in the inode. The blocks are effectively stale because writeback failure also discards dirty page state.
If this occurs, it is likely to trigger assert failures, free space accounting corruption and failures in unrelated file operations. For example, a subsequent reflink attempt of the affected file to a new target file will trip over the stale delalloc in the source file and fail. Several of these issues are occasionally reproduced by generic/648, but are reproducible on demand with the right sequence of operations and timely I/O error injection.
To fix this problem, update the ioend failure path to also punch out underlying data fork delalloc blocks on I/O error. This is analogous to the writeback submission failure path in xfs_discard_page() where we might fail to map data fork delalloc blocks and consistent with the successful COW writeback completion path, which is responsible for unmapping from the data fork and remapping in COW fork blocks.
Fixes: 787eb485509f ("xfs: fix and streamline error handling in xfs_end_io") Signed-off-by: Brian Foster bfoster@redhat.com Reviewed-by: Darrick J. Wong djwong@kernel.org Signed-off-by: Darrick J. Wong djwong@kernel.org Signed-off-by: Guo Xuenan guoxuenan@huawei.com Reviewed-by: Lihong Kou koulihong@huawei.com Reviewed-by: Zhang Yi yi.zhang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- fs/xfs/xfs_aops.c | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-)
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 7ffbeb14a36b..7aa67d95c578 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -118,6 +118,7 @@ xfs_end_ioend( struct iomap_ioend *ioend) { struct xfs_inode *ip = XFS_I(ioend->io_inode); + struct xfs_mount *mp = ip->i_mount; xfs_off_t offset = ioend->io_offset; size_t size = ioend->io_size; unsigned int nofs_flag; @@ -133,18 +134,26 @@ xfs_end_ioend( /* * Just clean up the in-memory strutures if the fs has been shut down. */ - if (XFS_FORCED_SHUTDOWN(ip->i_mount)) { + if (XFS_FORCED_SHUTDOWN(mp)) { error = -EIO; goto done; }
/* - * Clean up any COW blocks on an I/O error. + * Clean up all COW blocks and underlying data fork delalloc blocks on + * I/O error. The delalloc punch is required because this ioend was + * mapped to blocks in the COW fork and the associated pages are no + * longer dirty. If we don't remove delalloc blocks here, they become + * stale and can corrupt free space accounting on unmount. */ error = blk_status_to_errno(ioend->io_bio->bi_status); if (unlikely(error)) { - if (ioend->io_flags & IOMAP_F_SHARED) + if (ioend->io_flags & IOMAP_F_SHARED) { xfs_reflink_cancel_cow_range(ip, offset, size, true); + xfs_bmap_punch_delalloc_range(ip, + XFS_B_TO_FSBT(mp, offset), + XFS_B_TO_FSB(mp, size)); + } goto done; }
From: "Darrick J. Wong" djwong@kernel.org
mainline-inclusion from mainline-v5.11-rc4 commit 1aecf3734a95f3c167d1495550ca57556d33f7ec category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4KIAO CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
-------------------------------------------------
While refactoring the quota code to create a function to allocate inode change transactions, I noticed that xfs_qm_vop_chown_reserve does more than just make reservations: it also *modifies* the incore counts directly to handle the owner id change for the delalloc blocks.
I then observed that the fssetxattr code continues validating input arguments after making the quota reservation but before dirtying the transaction. If the routine decides to error out, it fails to undo the accounting switch! This leads to incorrect quota reservation and failure down the line.
We can fix this by making the reservation function do only that -- for the new dquot, it reserves ondisk and delalloc blocks to the transaction, and the old dquot hangs on to its incore reservation for now. Once we actually switch the dquots, we can then update the incore reservations because we've dirtied the transaction and it's too late to turn back now.
No fixes tag because this has been broken since the start of git.
Signed-off-by: Darrick J. Wong djwong@kernel.org Reviewed-by: Christoph Hellwig hch@lst.de Reviewed-by: Brian Foster bfoster@redhat.com Signed-off-by: Guo Xuenan guoxuenan@huawei.com Reviewed-by: Lihong Kou koulihong@huawei.com Reviewed-by: Zhang Yi yi.zhang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- fs/xfs/xfs_qm.c | 92 +++++++++++++++++++------------------------------ 1 file changed, 35 insertions(+), 57 deletions(-)
diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index b2a9abee8b2b..64e5da33733b 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -1785,6 +1785,29 @@ xfs_qm_vop_chown( xfs_trans_mod_dquot(tp, newdq, bfield, ip->i_d.di_nblocks); xfs_trans_mod_dquot(tp, newdq, XFS_TRANS_DQ_ICOUNT, 1);
+ /* + * Back when we made quota reservations for the chown, we reserved the + * ondisk blocks + delalloc blocks with the new dquot. Now that we've + * switched the dquots, decrease the new dquot's block reservation + * (having already bumped up the real counter) so that we don't have + * any reservation to give back when we commit. + */ + xfs_trans_mod_dquot(tp, newdq, XFS_TRANS_DQ_RES_BLKS, + -ip->i_delayed_blks); + + /* + * Give the incore reservation for delalloc blocks back to the old + * dquot. We don't normally handle delalloc quota reservations + * transactionally, so just lock the dquot and subtract from the + * reservation. Dirty the transaction because it's too late to turn + * back now. + */ + tp->t_flags |= XFS_TRANS_DIRTY; + xfs_dqlock(prevdq); + ASSERT(prevdq->q_blk.reserved >= ip->i_delayed_blks); + prevdq->q_blk.reserved -= ip->i_delayed_blks; + xfs_dqunlock(prevdq); + /* * Take an extra reference, because the inode is going to keep * this dquot pointer even after the trans_commit. @@ -1807,84 +1830,39 @@ xfs_qm_vop_chown_reserve( uint flags) { struct xfs_mount *mp = ip->i_mount; - uint64_t delblks; unsigned int blkflags; - struct xfs_dquot *udq_unres = NULL; - struct xfs_dquot *gdq_unres = NULL; - struct xfs_dquot *pdq_unres = NULL; struct xfs_dquot *udq_delblks = NULL; struct xfs_dquot *gdq_delblks = NULL; struct xfs_dquot *pdq_delblks = NULL; - int error; -
ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)); ASSERT(XFS_IS_QUOTA_RUNNING(mp));
- delblks = ip->i_delayed_blks; blkflags = XFS_IS_REALTIME_INODE(ip) ? XFS_QMOPT_RES_RTBLKS : XFS_QMOPT_RES_REGBLKS;
if (XFS_IS_UQUOTA_ON(mp) && udqp && - i_uid_read(VFS_I(ip)) != udqp->q_id) { + i_uid_read(VFS_I(ip)) != udqp->q_id) udq_delblks = udqp; - /* - * If there are delayed allocation blocks, then we have to - * unreserve those from the old dquot, and add them to the - * new dquot. - */ - if (delblks) { - ASSERT(ip->i_udquot); - udq_unres = ip->i_udquot; - } - } + if (XFS_IS_GQUOTA_ON(ip->i_mount) && gdqp && - i_gid_read(VFS_I(ip)) != gdqp->q_id) { + i_gid_read(VFS_I(ip)) != gdqp->q_id) gdq_delblks = gdqp; - if (delblks) { - ASSERT(ip->i_gdquot); - gdq_unres = ip->i_gdquot; - } - }
if (XFS_IS_PQUOTA_ON(ip->i_mount) && pdqp && - ip->i_d.di_projid != pdqp->q_id) { + ip->i_d.di_projid != pdqp->q_id) pdq_delblks = pdqp; - if (delblks) { - ASSERT(ip->i_pdquot); - pdq_unres = ip->i_pdquot; - } - } - - error = xfs_trans_reserve_quota_bydquots(tp, ip->i_mount, - udq_delblks, gdq_delblks, pdq_delblks, - ip->i_d.di_nblocks, 1, flags | blkflags); - if (error) - return error;
/* - * Do the delayed blks reservations/unreservations now. Since, these - * are done without the help of a transaction, if a reservation fails - * its previous reservations won't be automatically undone by trans - * code. So, we have to do it manually here. + * Reserve enough quota to handle blocks on disk and reserved for a + * delayed allocation. We'll actually transfer the delalloc + * reservation between dquots at chown time, even though that part is + * only semi-transactional. */ - if (delblks) { - /* - * Do the reservations first. Unreservation can't fail. - */ - ASSERT(udq_delblks || gdq_delblks || pdq_delblks); - ASSERT(udq_unres || gdq_unres || pdq_unres); - error = xfs_trans_reserve_quota_bydquots(NULL, ip->i_mount, - udq_delblks, gdq_delblks, pdq_delblks, - (xfs_qcnt_t)delblks, 0, flags | blkflags); - if (error) - return error; - xfs_trans_reserve_quota_bydquots(NULL, ip->i_mount, - udq_unres, gdq_unres, pdq_unres, - -((xfs_qcnt_t)delblks), 0, blkflags); - } - - return 0; + return xfs_trans_reserve_quota_bydquots(tp, ip->i_mount, udq_delblks, + gdq_delblks, pdq_delblks, + ip->i_d.di_nblocks + ip->i_delayed_blks, + 1, blkflags | flags); }
int
From: geruijun geruijun@huawei.com
euleros inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4MD9D?from=project-issue CVE: NA
--------------------------------
Add support of port isolation for QLogic HBA card 26
Signed-off-by: geruijun geruijun@huawei.com Signed-off-by: Hongxiang Lou louhongxiang@huawei.com Reviewed-by: Xiongfeng Wang wangxiongfeng2@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- drivers/pci/quirks.c | 2 ++ 1 file changed, 2 insertions(+)
diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c index 7ec42a6ab964..54a8e390ad67 100644 --- a/drivers/pci/quirks.c +++ b/drivers/pci/quirks.c @@ -4890,6 +4890,8 @@ static const struct pci_dev_acs_enabled { { PCI_VENDOR_ID_INTEL, PCI_ANY_ID, pci_quirk_intel_spt_pch_acs }, { 0x19a2, 0x710, pci_quirk_mf_endpoint_acs }, /* Emulex BE3-R */ { 0x10df, 0x720, pci_quirk_mf_endpoint_acs }, /* Emulex Skyhawk-R */ + { 0x1077, 0x2532, pci_quirk_mf_endpoint_acs}, /* QLogic QL2562 */ + { 0x1077, 0x2261, pci_quirk_mf_endpoint_acs}, /* QLogic QL2692 */ { 0x1077, 0x2031, pci_quirk_mf_endpoint_acs}, /* QLogic QL2672 */ { 0x1077, 0x2532, pci_quirk_mf_endpoint_acs}, /* Cavium ThunderX */
From: geruijun geruijun@huawei.com
euleros inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4M0EE?from=project-issue
--------------------------------
We found the following deadlock when running xfstests generic/390 with ext4 filesystem, and simutaneously offlining/onlining the disk we tested. It will cause a deadlock whose call trace is like this:
fsstress D 0 11672 11625 0x00000080 Call Trace: ? __schedule+0x2fc/0x930 ? filename_parentat+0x10b/0x1a0 schedule+0x28/0x70 rwsem_down_read_failed+0x102/0x1c0 ? __percpu_down_read+0x93/0xb0 __percpu_down_read+0x93/0xb0 __sb_start_write+0x5f/0x70 mnt_want_write+0x20/0x50 do_renameat2+0x1f3/0x550 __x64_sys_rename+0x1c/0x20 do_syscall_64+0x5b/0x1b0 entry_SYSCALL_64_after_hwframe+0x65/0xca
The root cause is that when ext4 hits IO error due to disk being offline, it will switch itself into read-only state. When it is frozen at that moment, following thaw_super() call will not unlock percpu freeze semaphores (as the fs is read-only) causing the deadlock.
Fix the problem by tracking whether the superblock was read-only at the time we were freezing it.
Reported-and-tested-by: Shijie Luo luoshijie1@huawei.com Signed-off-by: geruijun geruijun@huawei.com Signed-off-by: Jan Kara jack@suse.cz Reviewed-by: Zhang Yi yi.zhang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- fs/super.c | 9 ++++++++- include/linux/fs.h | 4 +++- 2 files changed, 11 insertions(+), 2 deletions(-)
diff --git a/fs/super.c b/fs/super.c index 98bb0629ee10..494bfdc6f778 100644 --- a/fs/super.c +++ b/fs/super.c @@ -1727,10 +1727,12 @@ int freeze_super(struct super_block *sb) if (sb_rdonly(sb)) { /* Nothing to do really... */ sb->s_writers.frozen = SB_FREEZE_COMPLETE; + sb->s_writers.frozen_ro = 1; up_write(&sb->s_umount); return 0; }
+ sb->s_writers.frozen_ro = 0; sb->s_writers.frozen = SB_FREEZE_WRITE; /* Release s_umount to preserve sb_start_write -> s_umount ordering */ up_write(&sb->s_umount); @@ -1786,7 +1788,12 @@ static int thaw_super_locked(struct super_block *sb) return -EINVAL; }
- if (sb_rdonly(sb)) { + /* + * Was the fs frozen in read-only state? Note that we cannot just check + * sb_rdonly(sb) as the filesystem might have switched to read-only + * state due to internal errors or so. + */ + if (sb->s_writers.frozen_ro) { sb->s_writers.frozen = SB_UNFROZEN; goto out; } diff --git a/include/linux/fs.h b/include/linux/fs.h index 243a0987ca2b..a56184847086 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1413,7 +1413,9 @@ enum { #define SB_FREEZE_LEVELS (SB_FREEZE_COMPLETE - 1)
struct sb_writers { - int frozen; /* Is sb frozen? */ + unsigned short frozen; /* Is sb frozen? */ + unsigned short frozen_ro; /* Was sb read-only + * when frozen? */ wait_queue_head_t wait_unfrozen; /* for get_super_thawed() */ struct percpu_rw_semaphore rw_sem[SB_FREEZE_LEVELS]; };
From: Bo Wu wubo40@huawei.com
euleros inclusion category: bugfix bugzilla: 13666, https://gitee.com/openeuler/kernel/issues/I4M9ZL?from=project-issue CVE: NA
-------------------------------------
SD5896 NP init failed, when some unused BAR is enabled. The unused BAR should be disabled to fix this problem.
Signed-off-by: geruijun geruijun@huawei.com Signed-off-by: Yeqing Peng pengyeqing@huawei.com Reviewed-by: Xiongfeng Wang wangxiongfeng2@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- drivers/pci/quirks.c | 25 +++++++++++++++++++++++++ include/linux/pci_ids.h | 3 +++ 2 files changed, 28 insertions(+)
diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c index 54a8e390ad67..a7676cdae529 100644 --- a/drivers/pci/quirks.c +++ b/drivers/pci/quirks.c @@ -5622,6 +5622,31 @@ static void quirk_switchtec_ntb_dma_alias(struct pci_dev *pdev) pci_iounmap(pdev, mmio); pci_disable_device(pdev); } + +static void pci_quirk_hisi_fixup_class(struct pci_dev *dev) +{ + dev->class = PCI_BASE_CLASS_NETWORK << 8; + pci_info(dev, "force hisi class type to network\n"); +} +DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_HUAWEI, PCIE_DEVICE_ID_HISI_5896, + pci_quirk_hisi_fixup_class); + +static void pci_quirk_hisi_fixup_bar(struct pci_dev *dev) +{ + int i, start = 3; + + for (i = start; i < PCI_NUM_RESOURCES; i++) { + dev->resource[i].start = 0; + dev->resource[i].end = 0; + dev->resource[i].flags = 0; + } + + pci_info(dev, "force disable hisilicon np bar\n"); +} +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_HUAWEI, PCIE_DEVICE_ID_HISI_5896, + pci_quirk_hisi_fixup_bar); + + #define SWITCHTEC_QUIRK(vid) \ DECLARE_PCI_FIXUP_CLASS_FINAL(PCI_VENDOR_ID_MICROSEMI, vid, \ PCI_CLASS_BRIDGE_OTHER, 8, quirk_switchtec_ntb_dma_alias) diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h index 635a9243cce0..f8da3669b6b0 100644 --- a/include/linux/pci_ids.h +++ b/include/linux/pci_ids.h @@ -2570,6 +2570,9 @@
#define PCI_VENDOR_ID_HUAWEI 0x19e5
+/* Hisilicon PCIe NP devices */ +#define PCIE_DEVICE_ID_HISI_5896 0x5896 + #define PCI_VENDOR_ID_NETRONOME 0x19ee #define PCI_DEVICE_ID_NETRONOME_NFP4000 0x4000 #define PCI_DEVICE_ID_NETRONOME_NFP5000 0x5000
From: Ruijun Ge geruijun@huawei.com
hulk inclusion category: bugfix bugzilla: 13666 https://gitee.com/openeuler/kernel/issues/I4DDEL CVE: NA
-----------------------------------------------------------------------------
The following patch set the class type as 'PCI_BASE_CLASS_NETWORK'. But 'PCI_BASE_CLASS_NETWORK' is actually the higher 8 bits of the class type of a network device. We should set it as 'PCI_CLASS_NETWORK_ETHERNET'. This patch fixes it.
Fixes: ba8bc9c15d20 ("PCI: Add quirk for hisilicon NP devices 5896")
Signed-off-by: Xiongfeng Wang wangxiongfeng2@huawei.com Reviewed-by: Hanjun Guo guohanjun@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com Signed-off-by: Xie XiuQi xiexiuqi@huawei.com Signed-off-by: Ruijun Ge geruijun@huawei.com Reviewed-by: Xiongfeng Wang wangxiongfeng2@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- drivers/pci/quirks.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c index a7676cdae529..9d93e681d4c5 100644 --- a/drivers/pci/quirks.c +++ b/drivers/pci/quirks.c @@ -5625,7 +5625,7 @@ static void quirk_switchtec_ntb_dma_alias(struct pci_dev *pdev)
static void pci_quirk_hisi_fixup_class(struct pci_dev *dev) { - dev->class = PCI_BASE_CLASS_NETWORK << 8; + dev->class = PCI_CLASS_NETWORK_ETHERNET << 8; pci_info(dev, "force hisi class type to network\n"); } DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_HUAWEI, PCIE_DEVICE_ID_HISI_5896,
From: Yang Shen shenyang39@huawei.com
driver inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4MG5G
----------------------------------------------------------------------
Enable deflate/lz77_zstd algorithm for uacce device on Kunpeng930.
Signed-off-by: Yang Shen shenyang39@huawei.com Acked-by: Xie XiuQi xiexiuqi@huawei.com Reviewed-by: Hao Fang fanghao11@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- drivers/crypto/hisilicon/zip/zip_main.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-)
diff --git a/drivers/crypto/hisilicon/zip/zip_main.c b/drivers/crypto/hisilicon/zip/zip_main.c index 873971ef9aee..9cc7d9ffc28c 100644 --- a/drivers/crypto/hisilicon/zip/zip_main.c +++ b/drivers/crypto/hisilicon/zip/zip_main.c @@ -829,7 +829,10 @@ static int hisi_zip_qm_init(struct hisi_qm *qm, struct pci_dev *pdev)
qm->pdev = pdev; qm->ver = pdev->revision; - qm->algs = "zlib\ngzip"; + if (pdev->revision >= QM_HW_V3) + qm->algs = "zlib\ngzip\ndeflate\nlz77_zstd"; + else + qm->algs = "zlib\ngzip"; qm->mode = uacce_mode; qm->sqe_size = HZIP_SQE_SIZE; qm->dev_name = hisi_zip_name;
From: Yang Shen shenyang39@huawei.com
driver inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4MG45
----------------------------------------------------------------------
For Kunpeng 920, the bit 0 of register 'HZIP_SGL_RUSER_32_63' stand for whether the ssid is valid. So this bit should be set as valid for sva mode.
Signed-off-by: Yang Shen shenyang39@huawei.com Acked-by: Xie XiuQi xiexiuqi@huawei.com Reviewed-by: Hao Fang fanghao11@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- drivers/crypto/hisilicon/zip/zip_main.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/drivers/crypto/hisilicon/zip/zip_main.c b/drivers/crypto/hisilicon/zip/zip_main.c index 9cc7d9ffc28c..9c4fb0af5ebd 100644 --- a/drivers/crypto/hisilicon/zip/zip_main.c +++ b/drivers/crypto/hisilicon/zip/zip_main.c @@ -364,15 +364,16 @@ static int hisi_zip_set_user_domain_and_cache(struct hisi_qm *qm)
/* user domain configurations */ writel(AXUSER_BASE, base + HZIP_BD_RUSER_32_63); - writel(AXUSER_BASE, base + HZIP_SGL_RUSER_32_63); writel(AXUSER_BASE, base + HZIP_BD_WUSER_32_63);
if (qm->use_sva && qm->ver == QM_HW_V2) { writel(AXUSER_BASE | AXUSER_SSV, base + HZIP_DATA_RUSER_32_63); writel(AXUSER_BASE | AXUSER_SSV, base + HZIP_DATA_WUSER_32_63); + writel(AXUSER_BASE | AXUSER_SSV, base + HZIP_SGL_RUSER_32_63); } else { writel(AXUSER_BASE, base + HZIP_DATA_RUSER_32_63); writel(AXUSER_BASE, base + HZIP_DATA_WUSER_32_63); + writel(AXUSER_BASE, base + HZIP_SGL_RUSER_32_63); }
/* let's open all compression/decompression cores */