Kernel

06 Feb '23
driver inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I6BSMN
-----------------------------------------------------------------------
In the ROH distributed scenario, the EID is allocated via DHCP. The
driver needs to convert the original MAC address to EID format and
update the destination MAC, chaddr and client id (if present) when
transmitting DHCP packets. Meanwhile, the chaddr field should follow
the source MAC address so that the DHCP server replies to the right
client. Since the payload of the DHCP packet changes, the L4 checksum
must be recalculated as well.
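(For reference, a DHCP option is encoded as [code][len][data...]; the
client-identifier option (61) carries a one-byte hardware type followed
by the MAC, which is why the MAC sits at offset 3 from the option start.
The sketch below is editorial and not part of the patch; the helper name
is made up, but the offsets match the DHCP_CLIENT_ID_* constants added
further down.)

/* Illustrative only: layout of DHCP option 61 (client identifier).
 *   opt[0] = 61   option code (DHCP_OPT_CLIENT_ID)
 *   opt[1] = 7    length: 1 byte hardware type + 6 byte MAC
 *   opt[2] = 1    hardware type (Ethernet)
 *   opt[3..8]     client MAC address
 * Rewriting the client MAC therefore means copying ETH_ALEN bytes at
 * offset 3, i.e. DHCP_CLIENT_ID_MAC_OFT in the patch below.
 */
static void demo_fill_client_id(u8 *opt, const u8 *mac)
{
	opt[0] = 61;             /* DHCP_OPT_CLIENT_ID */
	opt[1] = 7;              /* DHCP_CLIENT_ID_LEN */
	opt[2] = 1;              /* ARPHRD_ETHER hardware type */
	memcpy(&opt[3], mac, 6); /* ETH_ALEN bytes of the source MAC */
}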
Signed-off-by: Jian Shen <shenjian15(a)huawei.com>
Signed-off-by: Ke Chen <chenke54(a)huawei.com>
---
.../net/ethernet/hisilicon/hns3/hns3_enet.c | 172 +++++++++++++++++-
.../net/ethernet/hisilicon/hns3/hns3_enet.h | 50 +++++
.../hisilicon/hns3/hns3pf/hclge_main.c | 9 +
3 files changed, 226 insertions(+), 5 deletions(-)
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
index cf79cd69c766..460f9d217a18 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
@@ -1165,6 +1165,142 @@ static void hns3_tx_spare_reclaim_cb(struct hns3_enet_ring *ring,
}
}
+static struct hns3_dhcp_packet *hns3_get_dhcp_packet(struct sk_buff *skb,
+ int *dhcp_len)
+{
+ struct hns3_dhcp_packet *dhcp;
+ union l4_hdr_info l4;
+ int l4_payload_len;
+
+ l4.hdr = skb_transport_header(skb);
+ if (l4.udp->dest != htons(HNS3_DHCP_CLIENT_PORT) ||
+ l4.udp->source != htons(HNS3_DHCP_SERVER_PORT))
+ return NULL;
+
+ dhcp = (struct hns3_dhcp_packet *)(l4.hdr + sizeof(struct udphdr));
+ l4_payload_len = ntohs(l4.udp->len) - sizeof(struct udphdr);
+ if (l4_payload_len < offsetof(struct hns3_dhcp_packet, options) ||
+ dhcp->hlen != ETH_ALEN ||
+ dhcp->cookie != htonl(HNS3_DHCP_MAGIC))
+ return NULL;
+
+ *dhcp_len = l4_payload_len;
+ return dhcp;
+}
+
+static u8 *hns3_dhcp_option_scan(struct hns3_dhcp_packet *packet,
+ struct hns3_dhcp_opt_state *opt_state)
+{
+ int opt_len;
+ u8 *cur_opt;
+
+ /* option bytes: [code][len][data0~data[len-1]] */
+ while (opt_state->rem > 0) {
+ switch (opt_state->opt_ptr[DHCP_OPT_CODE]) {
+ /* option padding and end have no len and data byte. */
+ case DHCP_OPT_PADDING:
+ opt_state->rem--;
+ opt_state->opt_ptr++;
+ break;
+ case DHCP_OPT_END:
+ if (DHCP_OVERLOAD_USE_FILE(opt_state->overload_flag)) {
+ opt_state->overload_flag |=
+ DHCP_OVERLOAD_FILE_USED;
+ opt_state->opt_ptr = packet->file;
+ opt_state->rem = sizeof(packet->file);
+ break;
+ }
+ if (DHCP_OVERLOAD_USE_SNAME(opt_state->overload_flag)) {
+ opt_state->overload_flag |=
+ DHCP_OVERLOAD_SNAME_USED;
+ opt_state->opt_ptr = packet->sname;
+ opt_state->rem = sizeof(packet->sname);
+ break;
+ }
+ return NULL;
+ default:
+ if (opt_state->rem <= DHCP_OPT_LEN)
+ return NULL;
+ /* opt_len includes code, len and data bytes */
+ opt_len = opt_state->opt_ptr[DHCP_OPT_LEN] +
+ DHCP_OPT_DATA;
+ cur_opt = opt_state->opt_ptr;
+ if (opt_state->rem < opt_len)
+ return NULL;
+
+ opt_state->opt_ptr += opt_len;
+ opt_state->rem -= opt_len;
+ if (cur_opt[DHCP_OPT_CODE] == DHCP_OPT_OVERLOAD) {
+ opt_state->overload_flag |=
+ cur_opt[DHCP_OPT_DATA];
+ break;
+ }
+ return cur_opt;
+ }
+ }
+
+ return NULL;
+}
+
+static void hns3_dhcp_update_option61(struct hns3_nic_priv *priv,
+ struct hns3_dhcp_packet *packet,
+ int dhcp_len)
+{
+ struct hns3_dhcp_opt_state opt_state;
+ u8 *cur_opt;
+
+ opt_state.opt_ptr = packet->options;
+ opt_state.rem = dhcp_len - offsetof(struct hns3_dhcp_packet, options);
+ opt_state.overload_flag = 0;
+
+ cur_opt = hns3_dhcp_option_scan(packet, &opt_state);
+ while (cur_opt) {
+ if (cur_opt[DHCP_OPT_CODE] != DHCP_OPT_CLIENT_ID) {
+ cur_opt = hns3_dhcp_option_scan(packet, &opt_state);
+ continue;
+ }
+ if (cur_opt[DHCP_OPT_LEN] > ETH_ALEN)
+ ether_addr_copy(&cur_opt[DHCP_CLIENT_ID_MAC_OFT],
+ priv->roh_perm_mac);
+ break;
+ }
+}
+
+static void hns3_dhcp_cal_l4_csum(struct sk_buff *skb)
+{
+ union l3_hdr_info l3;
+ union l4_hdr_info l4;
+ __wsum csum = 0;
+ int offset;
+
+ if (skb->ip_summed == CHECKSUM_PARTIAL)
+ return;
+
+ l3.hdr = skb_network_header(skb);
+ l4.hdr = skb_transport_header(skb);
+ offset = skb_transport_offset(skb);
+ l4.udp->check = 0;
+ csum = csum_partial(l4.udp, ntohs(l4.udp->len), 0);
+ l4.udp->check = csum_tcpudp_magic(l3.v4->saddr, l3.v4->daddr,
+ skb->len - offset, IPPROTO_UDP, csum);
+}
+
+static void hns3_dhcp_packet_convert(struct hns3_nic_priv *priv,
+ struct sk_buff *skb,
+ struct hns3_dhcp_packet *dhcp,
+ int dhcp_len)
+{
+ struct ethhdr *l2hdr = eth_hdr(skb);
+
+ if (!dhcp)
+ return;
+
+ ether_addr_copy(dhcp->chaddr, l2hdr->h_source);
+ hns3_dhcp_update_option61(priv, dhcp, dhcp_len);
+ /* for l4 payload changed, need to re-calculate the csum */
+ hns3_dhcp_cal_l4_csum(skb);
+}
+
static int hns3_set_tso(struct sk_buff *skb, u32 *paylen_fdop_ol4cs,
u16 *mss, u32 *type_cs_vlan_tso, u32 *send_bytes)
{
@@ -1716,7 +1852,20 @@ static int hns3_handle_csum_partial(struct hns3_enet_ring *ring,
return 0;
}
-static int hns3_fill_skb_desc(struct hns3_enet_ring *ring,
+static bool hns3_roh_check_udpv4(struct sk_buff *skb)
+{
+ union l3_hdr_info l3;
+
+ l3.hdr = skb_network_header(skb);
+ if (skb->protocol != htons(ETH_P_IP) ||
+ l3.v4->version != IP_VERSION_IPV4)
+ return false;
+
+ return l3.v4->protocol == IPPROTO_UDP;
+}
+
+static int hns3_fill_skb_desc(struct hns3_nic_priv *priv,
+ struct hns3_enet_ring *ring,
struct sk_buff *skb, struct hns3_desc *desc,
struct hns3_desc_cb *desc_cb)
{
@@ -1741,6 +1890,14 @@ static int hns3_fill_skb_desc(struct hns3_enet_ring *ring,
hnae3_set_field(param.paylen_fdop_ol4cs, HNS3_TXD_FD_OP_M,
HNS3_TXD_FD_OP_S, fd_op);
+ if (hns3_roh_check_udpv4(skb)) {
+ struct hns3_dhcp_packet *dhcp;
+ int dhcp_len;
+
+ dhcp = hns3_get_dhcp_packet(skb, &dhcp_len);
+ hns3_dhcp_packet_convert(priv, skb, dhcp, dhcp_len);
+ }
+
/* Set txbd */
desc->tx.ol_type_vlan_len_msec =
cpu_to_le32(param.ol_type_vlan_len_msec);
@@ -2338,15 +2495,16 @@ static int hns3_handle_desc_filling(struct hns3_enet_ring *ring,
return hns3_fill_skb_to_desc(ring, skb, DESC_TYPE_SKB);
}
-static int hns3_handle_skb_desc(struct hns3_enet_ring *ring,
+static int hns3_handle_skb_desc(struct hns3_nic_priv *priv,
+ struct hns3_enet_ring *ring,
struct sk_buff *skb,
struct hns3_desc_cb *desc_cb,
int next_to_use_head)
{
int ret;
- ret = hns3_fill_skb_desc(ring, skb, &ring->desc[ring->next_to_use],
- desc_cb);
+ ret = hns3_fill_skb_desc(priv, ring, skb,
+ &ring->desc[ring->next_to_use], desc_cb);
if (unlikely(ret < 0))
goto fill_err;
@@ -2395,7 +2553,7 @@ netdev_tx_t hns3_nic_net_xmit(struct sk_buff *skb, struct net_device *netdev)
goto out_err_tx_ok;
}
- ret = hns3_handle_skb_desc(ring, skb, desc_cb, ring->next_to_use);
+ ret = hns3_handle_skb_desc(priv, ring, skb, desc_cb, ring->next_to_use);
if (unlikely(ret <= 0))
goto out_err_tx_ok;
@@ -5226,6 +5384,9 @@ static int hns3_init_mac_addr(struct net_device *netdev)
return 0;
}
+ if (is_zero_ether_addr(priv->roh_perm_mac))
+ ether_addr_copy(priv->roh_perm_mac, netdev->dev_addr);
+
if (h->ae_algo->ops->set_mac_addr)
ret = h->ae_algo->ops->set_mac_addr(h, netdev->dev_addr, true);
@@ -5377,6 +5538,7 @@ static int hns3_client_init(struct hnae3_handle *handle)
priv->tx_timeout_count = 0;
priv->max_non_tso_bd_num = ae_dev->dev_specs.max_non_tso_bd_num;
set_bit(HNS3_NIC_STATE_DOWN, &priv->state);
+ eth_zero_addr(priv->roh_perm_mac);
handle->msg_enable = netif_msg_init(debug, DEFAULT_MSG_LEVEL);
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.h b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.h
index ccfd38b0028e..85c352fff83b 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.h
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.h
@@ -604,6 +604,56 @@ struct hns3_nic_priv {
struct hns3_enet_coalesce rx_coal;
u32 tx_copybreak;
u32 rx_copybreak;
+ u8 roh_perm_mac[ETH_ALEN];
+};
+
+#define HNS3_DHCP_SERVER_PORT 68
+#define HNS3_DHCP_CLIENT_PORT 67
+#define HNS3_DHCP_MAGIC 0x63825363
+#define DHCP_OPT_CODE 0
+#define DHCP_OPT_LEN 1
+#define DHCP_OPT_DATA 2
+#define DHCP_CLIENT_ID_LEN 7
+#define DHCP_CLIENT_ID_MAC_OFT 3
+#define DHCP_OVERLOAD_FILE 0x1
+#define DHCP_OVERLOAD_SNAME 0x2
+#define DHCP_OVERLOAD_FILE_USED 0x101
+#define DHCP_OVERLOAD_SNAME_USED 0x202
+#define DHCP_OVERLOAD_USE_FILE(x) \
+ (((x) & DHCP_OVERLOAD_FILE_USED) == DHCP_OVERLOAD_FILE)
+#define DHCP_OVERLOAD_USE_SNAME(x) \
+ (((x) & DHCP_OVERLOAD_SNAME_USED) == DHCP_OVERLOAD_SNAME)
+
+enum DHCP_OPTION_CODES {
+ DHCP_OPT_PADDING = 0,
+ DHCP_OPT_OVERLOAD = 52,
+ DHCP_OPT_CLIENT_ID = 61,
+ DHCP_OPT_END = 255
+};
+
+struct hns3_dhcp_packet {
+ u8 op;
+ u8 htype;
+ u8 hlen;
+ u8 hops;
+ u32 xid;
+ u16 secs;
+ u16 flags;
+ u32 ciaddr;
+ u32 yiaddr;
+ u32 siaddr_nip;
+ u32 gateway_nip;
+ u8 chaddr[16]; /* link-layer client hardware address (MAC) */
+ u8 sname[64];
+ u8 file[128];
+ u32 cookie; /* DHCP magic bytes: 0x63825363 */
+ u8 options[312];
+};
+
+struct hns3_dhcp_opt_state {
+ u8 *opt_ptr; /* refer to current option item */
+ int rem; /* remain bytes in options */
+ u32 overload_flag; /* whether use file and sname field as options */
};
union l3_hdr_info {
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
index eea17548416b..5c8a821aa61a 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
@@ -2866,12 +2866,21 @@ static void hclge_get_fec(struct hnae3_handle *handle, u8 *fec_ability,
if (fec_mode)
*fec_mode = mac->fec_mode;
}
+
+static void hclge_roh_convert_mac_addr(struct hclge_dev *hdev)
+{
+#define HCLGE_ROH_EID_MASK_BYTE 3
+
+ memset(hdev->hw.mac.mac_addr, 0, HCLGE_ROH_EID_MASK_BYTE);
+}
+
static int hclge_mac_init(struct hclge_dev *hdev)
{
struct hclge_mac *mac = &hdev->hw.mac;
int ret;
hclge_mac_type_init(hdev);
+ hclge_roh_convert_mac_addr(hdev);
hdev->support_sfp_query = true;
hdev->hw.mac.duplex = HCLGE_MAC_FULL;
--
2.30.0

06 Feb '23
From: Sebastian Andrzej Siewior <bigeasy(a)linutronix.de>
stable inclusion
from stable-v5.10.142
commit d71a1c9fce184718d1b3a51a9e8a6e31cbbb45ce
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I6D0ZE
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id…
-------------------------------------------------
commit 278d3ba61563ceed3cb248383ced19e14ec7bc1f upstream.
On 32bit-UP u64_stats_fetch_begin() disables only preemption. If the
reader is in preemptible context and the writer side
(u64_stats_update_begin*()) runs in an interrupt context (IRQ or
softirq) then the writer can update the stats during the read operation.
This update remains undetected.
Use u64_stats_fetch_begin_irq() to ensure the stats fetch on 32bit-UP
are not interrupted by a writer. 32bit-SMP remains unaffected by this
change.
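(Editorial illustration of the reader pattern this patch converts the
drivers to; the demo_stats structure and function names are made up,
only the u64_stats_* calls are from the kernel API.)

struct demo_stats {
	u64 packets;
	u64 bytes;
	struct u64_stats_sync syncp;
};

static void demo_read_stats(struct demo_stats *s, u64 *packets, u64 *bytes)
{
	unsigned int start;

	do {
		/* On 32bit-UP the _irq variant also disables interrupts,
		 * so a writer running in (soft)irq context cannot update
		 * the counters while the two 32-bit halves are read.
		 */
		start = u64_stats_fetch_begin_irq(&s->syncp);
		*packets = s->packets;
		*bytes = s->bytes;
	} while (u64_stats_fetch_retry_irq(&s->syncp, start));
}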
Cc: "David S. Miller" <davem(a)davemloft.net>
Cc: Catherine Sullivan <csully(a)google.com>
Cc: David Awogbemila <awogbemila(a)google.com>
Cc: Dimitris Michailidis <dmichail(a)fungible.com>
Cc: Eric Dumazet <edumazet(a)google.com>
Cc: Hans Ulli Kroll <ulli.kroll(a)googlemail.com>
Cc: Jakub Kicinski <kuba(a)kernel.org>
Cc: Jeroen de Borst <jeroendb(a)google.com>
Cc: Johannes Berg <johannes(a)sipsolutions.net>
Cc: Linus Walleij <linus.walleij(a)linaro.org>
Cc: Paolo Abeni <pabeni(a)redhat.com>
Cc: Simon Horman <simon.horman(a)corigine.com>
Cc: linux-arm-kernel(a)lists.infradead.org
Cc: linux-wireless(a)vger.kernel.org
Cc: netdev(a)vger.kernel.org
Cc: oss-drivers(a)corigine.com
Cc: stable(a)vger.kernel.org
Signed-off-by: Sebastian Andrzej Siewior <bigeasy(a)linutronix.de>
Reviewed-by: Simon Horman <simon.horman(a)corigine.com>
Signed-off-by: David S. Miller <davem(a)davemloft.net>
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
(cherry picked from commit d71a1c9fce184718d1b3a51a9e8a6e31cbbb45ce)
Signed-off-by: Wang Yufen <wangyufen(a)huawei.com>
Conflicts:
drivers/net/ethernet/huawei/hinic/hinic_rx.c
drivers/net/ethernet/huawei/hinic/hinic_tx.c
Signed-off-by: Wang Yufen <wangyufen(a)huawei.com>
---
drivers/net/ethernet/cortina/gemini.c | 24 +++++++++++-----------
drivers/net/ethernet/google/gve/gve_ethtool.c | 16 +++++++--------
drivers/net/ethernet/google/gve/gve_main.c | 12 +++++------
drivers/net/ethernet/huawei/hinic/hinic_rx.c | 4 ++--
drivers/net/ethernet/huawei/hinic/hinic_tx.c | 4 ++--
.../net/ethernet/netronome/nfp/nfp_net_common.c | 8 ++++----
.../net/ethernet/netronome/nfp/nfp_net_ethtool.c | 8 ++++----
drivers/net/netdevsim/netdev.c | 4 ++--
net/mac80211/sta_info.c | 8 ++++----
net/mpls/af_mpls.c | 4 ++--
10 files changed, 46 insertions(+), 46 deletions(-)
diff --git a/drivers/net/ethernet/cortina/gemini.c b/drivers/net/ethernet/cortina/gemini.c
index 3685878..b22ea40 100644
--- a/drivers/net/ethernet/cortina/gemini.c
+++ b/drivers/net/ethernet/cortina/gemini.c
@@ -1920,7 +1920,7 @@ static void gmac_get_stats64(struct net_device *netdev,
/* Racing with RX NAPI */
do {
- start = u64_stats_fetch_begin(&port->rx_stats_syncp);
+ start = u64_stats_fetch_begin_irq(&port->rx_stats_syncp);
stats->rx_packets = port->stats.rx_packets;
stats->rx_bytes = port->stats.rx_bytes;
@@ -1932,11 +1932,11 @@ static void gmac_get_stats64(struct net_device *netdev,
stats->rx_crc_errors = port->stats.rx_crc_errors;
stats->rx_frame_errors = port->stats.rx_frame_errors;
- } while (u64_stats_fetch_retry(&port->rx_stats_syncp, start));
+ } while (u64_stats_fetch_retry_irq(&port->rx_stats_syncp, start));
/* Racing with MIB and TX completion interrupts */
do {
- start = u64_stats_fetch_begin(&port->ir_stats_syncp);
+ start = u64_stats_fetch_begin_irq(&port->ir_stats_syncp);
stats->tx_errors = port->stats.tx_errors;
stats->tx_packets = port->stats.tx_packets;
@@ -1946,15 +1946,15 @@ static void gmac_get_stats64(struct net_device *netdev,
stats->rx_missed_errors = port->stats.rx_missed_errors;
stats->rx_fifo_errors = port->stats.rx_fifo_errors;
- } while (u64_stats_fetch_retry(&port->ir_stats_syncp, start));
+ } while (u64_stats_fetch_retry_irq(&port->ir_stats_syncp, start));
/* Racing with hard_start_xmit */
do {
- start = u64_stats_fetch_begin(&port->tx_stats_syncp);
+ start = u64_stats_fetch_begin_irq(&port->tx_stats_syncp);
stats->tx_dropped = port->stats.tx_dropped;
- } while (u64_stats_fetch_retry(&port->tx_stats_syncp, start));
+ } while (u64_stats_fetch_retry_irq(&port->tx_stats_syncp, start));
stats->rx_dropped += stats->rx_missed_errors;
}
@@ -2032,18 +2032,18 @@ static void gmac_get_ethtool_stats(struct net_device *netdev,
/* Racing with MIB interrupt */
do {
p = values;
- start = u64_stats_fetch_begin(&port->ir_stats_syncp);
+ start = u64_stats_fetch_begin_irq(&port->ir_stats_syncp);
for (i = 0; i < RX_STATS_NUM; i++)
*p++ = port->hw_stats[i];
- } while (u64_stats_fetch_retry(&port->ir_stats_syncp, start));
+ } while (u64_stats_fetch_retry_irq(&port->ir_stats_syncp, start));
values = p;
/* Racing with RX NAPI */
do {
p = values;
- start = u64_stats_fetch_begin(&port->rx_stats_syncp);
+ start = u64_stats_fetch_begin_irq(&port->rx_stats_syncp);
for (i = 0; i < RX_STATUS_NUM; i++)
*p++ = port->rx_stats[i];
@@ -2051,13 +2051,13 @@ static void gmac_get_ethtool_stats(struct net_device *netdev,
*p++ = port->rx_csum_stats[i];
*p++ = port->rx_napi_exits;
- } while (u64_stats_fetch_retry(&port->rx_stats_syncp, start));
+ } while (u64_stats_fetch_retry_irq(&port->rx_stats_syncp, start));
values = p;
/* Racing with TX start_xmit */
do {
p = values;
- start = u64_stats_fetch_begin(&port->tx_stats_syncp);
+ start = u64_stats_fetch_begin_irq(&port->tx_stats_syncp);
for (i = 0; i < TX_MAX_FRAGS; i++) {
*values++ = port->tx_frag_stats[i];
@@ -2066,7 +2066,7 @@ static void gmac_get_ethtool_stats(struct net_device *netdev,
*values++ = port->tx_frags_linearized;
*values++ = port->tx_hw_csummed;
- } while (u64_stats_fetch_retry(&port->tx_stats_syncp, start));
+ } while (u64_stats_fetch_retry_irq(&port->tx_stats_syncp, start));
}
static int gmac_get_ksettings(struct net_device *netdev,
diff --git a/drivers/net/ethernet/google/gve/gve_ethtool.c b/drivers/net/ethernet/google/gve/gve_ethtool.c
index 66f9b37..80a8c0c 100644
--- a/drivers/net/ethernet/google/gve/gve_ethtool.c
+++ b/drivers/net/ethernet/google/gve/gve_ethtool.c
@@ -172,14 +172,14 @@ static int gve_get_sset_count(struct net_device *netdev, int sset)
struct gve_rx_ring *rx = &priv->rx[ring];
start =
- u64_stats_fetch_begin(&priv->rx[ring].statss);
+ u64_stats_fetch_begin_irq(&priv->rx[ring].statss);
tmp_rx_pkts = rx->rpackets;
tmp_rx_bytes = rx->rbytes;
tmp_rx_skb_alloc_fail = rx->rx_skb_alloc_fail;
tmp_rx_buf_alloc_fail = rx->rx_buf_alloc_fail;
tmp_rx_desc_err_dropped_pkt =
rx->rx_desc_err_dropped_pkt;
- } while (u64_stats_fetch_retry(&priv->rx[ring].statss,
+ } while (u64_stats_fetch_retry_irq(&priv->rx[ring].statss,
start));
rx_pkts += tmp_rx_pkts;
rx_bytes += tmp_rx_bytes;
@@ -193,10 +193,10 @@ static int gve_get_sset_count(struct net_device *netdev, int sset)
if (priv->tx) {
do {
start =
- u64_stats_fetch_begin(&priv->tx[ring].statss);
+ u64_stats_fetch_begin_irq(&priv->tx[ring].statss);
tmp_tx_pkts = priv->tx[ring].pkt_done;
tmp_tx_bytes = priv->tx[ring].bytes_done;
- } while (u64_stats_fetch_retry(&priv->tx[ring].statss,
+ } while (u64_stats_fetch_retry_irq(&priv->tx[ring].statss,
start));
tx_pkts += tmp_tx_pkts;
tx_bytes += tmp_tx_bytes;
@@ -254,13 +254,13 @@ static int gve_get_sset_count(struct net_device *netdev, int sset)
data[i++] = rx->cnt;
do {
start =
- u64_stats_fetch_begin(&priv->rx[ring].statss);
+ u64_stats_fetch_begin_irq(&priv->rx[ring].statss);
tmp_rx_bytes = rx->rbytes;
tmp_rx_skb_alloc_fail = rx->rx_skb_alloc_fail;
tmp_rx_buf_alloc_fail = rx->rx_buf_alloc_fail;
tmp_rx_desc_err_dropped_pkt =
rx->rx_desc_err_dropped_pkt;
- } while (u64_stats_fetch_retry(&priv->rx[ring].statss,
+ } while (u64_stats_fetch_retry_irq(&priv->rx[ring].statss,
start));
data[i++] = tmp_rx_bytes;
/* rx dropped packets */
@@ -313,9 +313,9 @@ static int gve_get_sset_count(struct net_device *netdev, int sset)
data[i++] = tx->done;
do {
start =
- u64_stats_fetch_begin(&priv->tx[ring].statss);
+ u64_stats_fetch_begin_irq(&priv->tx[ring].statss);
tmp_tx_bytes = tx->bytes_done;
- } while (u64_stats_fetch_retry(&priv->tx[ring].statss,
+ } while (u64_stats_fetch_retry_irq(&priv->tx[ring].statss,
start));
data[i++] = tmp_tx_bytes;
data[i++] = tx->wake_queue;
diff --git a/drivers/net/ethernet/google/gve/gve_main.c b/drivers/net/ethernet/google/gve/gve_main.c
index 6cb75bb..f0c1e6c8 100644
--- a/drivers/net/ethernet/google/gve/gve_main.c
+++ b/drivers/net/ethernet/google/gve/gve_main.c
@@ -40,10 +40,10 @@ static void gve_get_stats(struct net_device *dev, struct rtnl_link_stats64 *s)
for (ring = 0; ring < priv->rx_cfg.num_queues; ring++) {
do {
start =
- u64_stats_fetch_begin(&priv->rx[ring].statss);
+ u64_stats_fetch_begin_irq(&priv->rx[ring].statss);
packets = priv->rx[ring].rpackets;
bytes = priv->rx[ring].rbytes;
- } while (u64_stats_fetch_retry(&priv->rx[ring].statss,
+ } while (u64_stats_fetch_retry_irq(&priv->rx[ring].statss,
start));
s->rx_packets += packets;
s->rx_bytes += bytes;
@@ -53,10 +53,10 @@ static void gve_get_stats(struct net_device *dev, struct rtnl_link_stats64 *s)
for (ring = 0; ring < priv->tx_cfg.num_queues; ring++) {
do {
start =
- u64_stats_fetch_begin(&priv->tx[ring].statss);
+ u64_stats_fetch_begin_irq(&priv->tx[ring].statss);
packets = priv->tx[ring].pkt_done;
bytes = priv->tx[ring].bytes_done;
- } while (u64_stats_fetch_retry(&priv->tx[ring].statss,
+ } while (u64_stats_fetch_retry_irq(&priv->tx[ring].statss,
start));
s->tx_packets += packets;
s->tx_bytes += bytes;
@@ -1041,9 +1041,9 @@ void gve_handle_report_stats(struct gve_priv *priv)
if (priv->tx) {
for (idx = 0; idx < priv->tx_cfg.num_queues; idx++) {
do {
- start = u64_stats_fetch_begin(&priv->tx[idx].statss);
+ start = u64_stats_fetch_begin_irq(&priv->tx[idx].statss);
tx_bytes = priv->tx[idx].bytes_done;
- } while (u64_stats_fetch_retry(&priv->tx[idx].statss, start));
+ } while (u64_stats_fetch_retry_irq(&priv->tx[idx].statss, start));
stats[stats_idx++] = (struct stats) {
.stat_name = cpu_to_be32(TX_WAKE_CNT),
.value = cpu_to_be64(priv->tx[idx].wake_queue),
diff --git a/drivers/net/ethernet/huawei/hinic/hinic_rx.c b/drivers/net/ethernet/huawei/hinic/hinic_rx.c
index 57d5d79..1b57b67 100644
--- a/drivers/net/ethernet/huawei/hinic/hinic_rx.c
+++ b/drivers/net/ethernet/huawei/hinic/hinic_rx.c
@@ -375,7 +375,7 @@ void hinic_rxq_get_stats(struct hinic_rxq *rxq,
u64_stats_update_begin(&stats->syncp);
do {
- start = u64_stats_fetch_begin(&rxq_stats->syncp);
+ start = u64_stats_fetch_begin_irq(&rxq_stats->syncp);
stats->bytes = rxq_stats->bytes;
stats->packets = rxq_stats->packets;
stats->errors = rxq_stats->csum_errors +
@@ -384,7 +384,7 @@ void hinic_rxq_get_stats(struct hinic_rxq *rxq,
stats->other_errors = rxq_stats->other_errors;
stats->dropped = rxq_stats->dropped;
stats->rx_buf_empty = rxq_stats->rx_buf_empty;
- } while (u64_stats_fetch_retry(&rxq_stats->syncp, start));
+ } while (u64_stats_fetch_retry_irq(&rxq_stats->syncp, start));
u64_stats_update_end(&stats->syncp);
}
diff --git a/drivers/net/ethernet/huawei/hinic/hinic_tx.c b/drivers/net/ethernet/huawei/hinic/hinic_tx.c
index 75fa344..ff37b6f 100644
--- a/drivers/net/ethernet/huawei/hinic/hinic_tx.c
+++ b/drivers/net/ethernet/huawei/hinic/hinic_tx.c
@@ -61,7 +61,7 @@ void hinic_txq_get_stats(struct hinic_txq *txq,
u64_stats_update_begin(&stats->syncp);
do {
- start = u64_stats_fetch_begin(&txq_stats->syncp);
+ start = u64_stats_fetch_begin_irq(&txq_stats->syncp);
stats->bytes = txq_stats->bytes;
stats->packets = txq_stats->packets;
stats->busy = txq_stats->busy;
@@ -69,7 +69,7 @@ void hinic_txq_get_stats(struct hinic_txq *txq,
stats->dropped = txq_stats->dropped;
stats->big_frags_pkts = txq_stats->big_frags_pkts;
stats->big_udp_pkts = txq_stats->big_udp_pkts;
- } while (u64_stats_fetch_retry(&txq_stats->syncp, start));
+ } while (u64_stats_fetch_retry_irq(&txq_stats->syncp, start));
u64_stats_update_end(&stats->syncp);
}
diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
index dfc1f32..5ab230aa 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
+++ b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
@@ -3373,21 +3373,21 @@ static void nfp_net_stat64(struct net_device *netdev,
unsigned int start;
do {
- start = u64_stats_fetch_begin(&r_vec->rx_sync);
+ start = u64_stats_fetch_begin_irq(&r_vec->rx_sync);
data[0] = r_vec->rx_pkts;
data[1] = r_vec->rx_bytes;
data[2] = r_vec->rx_drops;
- } while (u64_stats_fetch_retry(&r_vec->rx_sync, start));
+ } while (u64_stats_fetch_retry_irq(&r_vec->rx_sync, start));
stats->rx_packets += data[0];
stats->rx_bytes += data[1];
stats->rx_dropped += data[2];
do {
- start = u64_stats_fetch_begin(&r_vec->tx_sync);
+ start = u64_stats_fetch_begin_irq(&r_vec->tx_sync);
data[0] = r_vec->tx_pkts;
data[1] = r_vec->tx_bytes;
data[2] = r_vec->tx_errors;
- } while (u64_stats_fetch_retry(&r_vec->tx_sync, start));
+ } while (u64_stats_fetch_retry_irq(&r_vec->tx_sync, start));
stats->tx_packets += data[0];
stats->tx_bytes += data[1];
stats->tx_errors += data[2];
diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_ethtool.c b/drivers/net/ethernet/netronome/nfp/nfp_net_ethtool.c
index bfcd90f..d4136d3 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_net_ethtool.c
+++ b/drivers/net/ethernet/netronome/nfp/nfp_net_ethtool.c
@@ -498,7 +498,7 @@ static u64 *nfp_vnic_get_sw_stats(struct net_device *netdev, u64 *data)
unsigned int start;
do {
- start = u64_stats_fetch_begin(&nn->r_vecs[i].rx_sync);
+ start = u64_stats_fetch_begin_irq(&nn->r_vecs[i].rx_sync);
data[0] = nn->r_vecs[i].rx_pkts;
tmp[0] = nn->r_vecs[i].hw_csum_rx_ok;
tmp[1] = nn->r_vecs[i].hw_csum_rx_inner_ok;
@@ -506,10 +506,10 @@ static u64 *nfp_vnic_get_sw_stats(struct net_device *netdev, u64 *data)
tmp[3] = nn->r_vecs[i].hw_csum_rx_error;
tmp[4] = nn->r_vecs[i].rx_replace_buf_alloc_fail;
tmp[5] = nn->r_vecs[i].hw_tls_rx;
- } while (u64_stats_fetch_retry(&nn->r_vecs[i].rx_sync, start));
+ } while (u64_stats_fetch_retry_irq(&nn->r_vecs[i].rx_sync, start));
do {
- start = u64_stats_fetch_begin(&nn->r_vecs[i].tx_sync);
+ start = u64_stats_fetch_begin_irq(&nn->r_vecs[i].tx_sync);
data[1] = nn->r_vecs[i].tx_pkts;
data[2] = nn->r_vecs[i].tx_busy;
tmp[6] = nn->r_vecs[i].hw_csum_tx;
@@ -519,7 +519,7 @@ static u64 *nfp_vnic_get_sw_stats(struct net_device *netdev, u64 *data)
tmp[10] = nn->r_vecs[i].hw_tls_tx;
tmp[11] = nn->r_vecs[i].tls_tx_fallback;
tmp[12] = nn->r_vecs[i].tls_tx_no_fallback;
- } while (u64_stats_fetch_retry(&nn->r_vecs[i].tx_sync, start));
+ } while (u64_stats_fetch_retry_irq(&nn->r_vecs[i].tx_sync, start));
data += NN_RVEC_PER_Q_STATS;
diff --git a/drivers/net/netdevsim/netdev.c b/drivers/net/netdevsim/netdev.c
index ad6dbf01..4fb0638 100644
--- a/drivers/net/netdevsim/netdev.c
+++ b/drivers/net/netdevsim/netdev.c
@@ -67,10 +67,10 @@ static int nsim_change_mtu(struct net_device *dev, int new_mtu)
unsigned int start;
do {
- start = u64_stats_fetch_begin(&ns->syncp);
+ start = u64_stats_fetch_begin_irq(&ns->syncp);
stats->tx_bytes = ns->tx_bytes;
stats->tx_packets = ns->tx_packets;
- } while (u64_stats_fetch_retry(&ns->syncp, start));
+ } while (u64_stats_fetch_retry_irq(&ns->syncp, start));
}
static int
diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c
index 461c037..cee39ae 100644
--- a/net/mac80211/sta_info.c
+++ b/net/mac80211/sta_info.c
@@ -2175,9 +2175,9 @@ static inline u64 sta_get_tidstats_msdu(struct ieee80211_sta_rx_stats *rxstats,
u64 value;
do {
- start = u64_stats_fetch_begin(&rxstats->syncp);
+ start = u64_stats_fetch_begin_irq(&rxstats->syncp);
value = rxstats->msdu[tid];
- } while (u64_stats_fetch_retry(&rxstats->syncp, start));
+ } while (u64_stats_fetch_retry_irq(&rxstats->syncp, start));
return value;
}
@@ -2241,9 +2241,9 @@ static inline u64 sta_get_stats_bytes(struct ieee80211_sta_rx_stats *rxstats)
u64 value;
do {
- start = u64_stats_fetch_begin(&rxstats->syncp);
+ start = u64_stats_fetch_begin_irq(&rxstats->syncp);
value = rxstats->bytes;
- } while (u64_stats_fetch_retry(&rxstats->syncp, start));
+ } while (u64_stats_fetch_retry_irq(&rxstats->syncp, start));
return value;
}
diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c
index 9c047c1..7239814 100644
--- a/net/mpls/af_mpls.c
+++ b/net/mpls/af_mpls.c
@@ -1078,9 +1078,9 @@ static void mpls_get_stats(struct mpls_dev *mdev,
p = per_cpu_ptr(mdev->stats, i);
do {
- start = u64_stats_fetch_begin(&p->syncp);
+ start = u64_stats_fetch_begin_irq(&p->syncp);
local = p->stats;
- } while (u64_stats_fetch_retry(&p->syncp, start));
+ } while (u64_stats_fetch_retry_irq(&p->syncp, start));
stats->rx_packets += local.rx_packets;
stats->rx_bytes += local.rx_bytes;
--
1.8.3.1

[PATCH openEuler-1.0-LTS 1/3] mm: hwpoison: refactor refcount check handling
by Yongqiang Liu 04 Feb '23
From: Yang Shi <shy828301(a)gmail.com>
stable inclusion
from stable-v5.15.86
commit a62b1bc603a1ded739e7cf543da29a3eb93cc534
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I6AR36
CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id…
--------------------------------
commit dd0f230a0a80ff396c7ce587f16429f2a8131344 upstream.
Memory failure will report failure if the page still has extra pinned
refcount other than from hwpoison after the handler is done. Actually
the check is not necessary for all handlers, so move the check into
specific handlers. This would make the following keeping shmem page in
page cache patch easier.
There may be expected extra pin for some cases, for example, when the
page is dirty and in swapcache.
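As an editorial summary (mirroring the has_extra_refcount() helper
introduced below, not additional patch code), the per-handler check
boils down to:

	count = page_count(p) - 1;	/* drop the reference held by hwpoison */
	if (extra_pins)			/* e.g. dirty page still in swap cache */
		count -= 1;
	if (count > 0)			/* somebody else still holds the page */
		result = MF_FAILED;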
Link: https://lkml.kernel.org/r/20211020210755.23964-5-shy828301@gmail.com
Signed-off-by: Yang Shi <shy828301(a)gmail.com>
Signed-off-by: Naoya Horiguchi <naoya.horiguchi(a)nec.com>
Suggested-by: Naoya Horiguchi <naoya.horiguchi(a)nec.com>
Cc: Hugh Dickins <hughd(a)google.com>
Cc: Kirill A. Shutemov <kirill.shutemov(a)linux.intel.com>
Cc: Matthew Wilcox <willy(a)infradead.org>
Cc: Oscar Salvador <osalvador(a)suse.de>
Cc: Peter Xu <peterx(a)redhat.com>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds(a)linux-foundation.org>
Cc: Naoya Horiguchi <naoya.horiguchi(a)linux.dev>
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
Signed-off-by: Ze Zuo <zuoze1(a)huawei.com>
Reviewed-by: Kefeng Wang <wangkefeng.wang(a)huawei.com>
Signed-off-by: Yongqiang Liu <liuyongqiang13(a)huawei.com>
---
mm/memory-failure.c | 93 +++++++++++++++++++++++++++++++--------------
1 file changed, 64 insertions(+), 29 deletions(-)
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 9a816fdf812d..b653637d5a00 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -655,12 +655,44 @@ static int truncate_error_page(struct page *p, unsigned long pfn,
return ret;
}
+struct page_state {
+ unsigned long mask;
+ unsigned long res;
+ enum mf_action_page_type type;
+
+ /* Callback ->action() has to unlock the relevant page inside it. */
+ int (*action)(struct page_state *ps, struct page *p);
+};
+
+/*
+ * Return true if page is still referenced by others, otherwise return
+ * false.
+ *
+ * The extra_pins is true when one extra refcount is expected.
+ */
+static bool has_extra_refcount(struct page_state *ps, struct page *p,
+ bool extra_pins)
+{
+ int count = page_count(p) - 1;
+
+ if (extra_pins)
+ count -= 1;
+
+ if (count > 0) {
+ pr_err("Memory failure: %#lx: %s still referenced by %d users\n",
+ page_to_pfn(p), action_page_types[ps->type], count);
+ return true;
+ }
+
+ return false;
+}
+
/*
* Error hit kernel page.
* Do nothing, try to be lucky and not touch this instead. For a few cases we
* could be more sophisticated.
*/
-static int me_kernel(struct page *p, unsigned long pfn)
+static int me_kernel(struct page_state *ps, struct page *p)
{
unlock_page(p);
return MF_IGNORED;
@@ -669,9 +701,9 @@ static int me_kernel(struct page *p, unsigned long pfn)
/*
* Page in unknown state. Do nothing.
*/
-static int me_unknown(struct page *p, unsigned long pfn)
+static int me_unknown(struct page_state *ps, struct page *p)
{
- pr_err("Memory failure: %#lx: Unknown page state\n", pfn);
+ pr_err("Memory failure: %#lx: Unknown page state\n", page_to_pfn(p));
unlock_page(p);
return MF_FAILED;
}
@@ -679,7 +711,7 @@ static int me_unknown(struct page *p, unsigned long pfn)
/*
* Clean (or cleaned) page cache page.
*/
-static int me_pagecache_clean(struct page *p, unsigned long pfn)
+static int me_pagecache_clean(struct page_state *ps, struct page *p)
{
int ret;
struct address_space *mapping;
@@ -716,9 +748,13 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn)
*
* Open: to take i_mutex or not for this? Right now we don't.
*/
- ret = truncate_error_page(p, pfn, mapping);
+ ret = truncate_error_page(p, page_to_pfn(p), mapping);
out:
unlock_page(p);
+
+ if (has_extra_refcount(ps, p, false))
+ ret = MF_FAILED;
+
return ret;
}
@@ -727,7 +763,7 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn)
* Issues: when the error hit a hole page the error is not properly
* propagated.
*/
-static int me_pagecache_dirty(struct page *p, unsigned long pfn)
+static int me_pagecache_dirty(struct page_state *ps, struct page *p)
{
struct address_space *mapping = page_mapping(p);
@@ -771,7 +807,7 @@ static int me_pagecache_dirty(struct page *p, unsigned long pfn)
mapping_set_error(mapping, -EIO);
}
- return me_pagecache_clean(p, pfn);
+ return me_pagecache_clean(ps, p);
}
/*
@@ -793,9 +829,10 @@ static int me_pagecache_dirty(struct page *p, unsigned long pfn)
* Clean swap cache pages can be directly isolated. A later page fault will
* bring in the known good data from disk.
*/
-static int me_swapcache_dirty(struct page *p, unsigned long pfn)
+static int me_swapcache_dirty(struct page_state *ps, struct page *p)
{
int ret;
+ bool extra_pins = false;
ClearPageDirty(p);
/* Trigger EIO in shmem: */
@@ -803,10 +840,17 @@ static int me_swapcache_dirty(struct page *p, unsigned long pfn)
ret = delete_from_lru_cache(p) ? MF_FAILED : MF_DELAYED;
unlock_page(p);
+
+ if (ret == MF_DELAYED)
+ extra_pins = true;
+
+ if (has_extra_refcount(ps, p, extra_pins))
+ ret = MF_FAILED;
+
return ret;
}
-static int me_swapcache_clean(struct page *p, unsigned long pfn)
+static int me_swapcache_clean(struct page_state *ps, struct page *p)
{
int ret;
@@ -814,6 +858,10 @@ static int me_swapcache_clean(struct page *p, unsigned long pfn)
ret = delete_from_lru_cache(p) ? MF_FAILED : MF_RECOVERED;
unlock_page(p);
+
+ if (has_extra_refcount(ps, p, false))
+ ret = MF_FAILED;
+
return ret;
}
@@ -823,7 +871,7 @@ static int me_swapcache_clean(struct page *p, unsigned long pfn)
* - Error on hugepage is contained in hugepage unit (not in raw page unit.)
* To narrow down kill region to one page, we need to break up pmd.
*/
-static int me_huge_page(struct page *p, unsigned long pfn)
+static int me_huge_page(struct page_state *ps, struct page *p)
{
int res;
struct page *hpage = compound_head(p);
@@ -834,7 +882,7 @@ static int me_huge_page(struct page *p, unsigned long pfn)
mapping = page_mapping(hpage);
if (mapping) {
- res = truncate_error_page(hpage, pfn, mapping);
+ res = truncate_error_page(hpage, page_to_pfn(p), mapping);
unlock_page(hpage);
} else {
res = MF_FAILED;
@@ -852,6 +900,9 @@ static int me_huge_page(struct page *p, unsigned long pfn)
}
}
+ if (has_extra_refcount(ps, p, false))
+ res = MF_FAILED;
+
return res;
}
@@ -878,14 +929,7 @@ static int me_huge_page(struct page *p, unsigned long pfn)
#define slab (1UL << PG_slab)
#define reserved (1UL << PG_reserved)
-static struct page_state {
- unsigned long mask;
- unsigned long res;
- enum mf_action_page_type type;
-
- /* Callback ->action() has to unlock the relevant page inside it. */
- int (*action)(struct page *p, unsigned long pfn);
-} error_states[] = {
+static struct page_state error_states[] = {
{ reserved, reserved, MF_MSG_KERNEL, me_kernel },
/*
* free pages are specially detected outside this table:
@@ -946,19 +990,10 @@ static int page_action(struct page_state *ps, struct page *p,
unsigned long pfn)
{
int result;
- int count;
/* page p should be unlocked after returning from ps->action(). */
- result = ps->action(p, pfn);
+ result = ps->action(ps, p);
- count = page_count(p) - 1;
- if (ps->action == me_swapcache_dirty && result == MF_DELAYED)
- count--;
- if (count > 0) {
- pr_err("Memory failure: %#lx: %s still referenced by %d users\n",
- pfn, action_page_types[ps->type], count);
- result = MF_FAILED;
- }
action_result(pfn, ps->type, result);
/* Could do more checks here if page looks ok */
--
2.25.1

[PATCH openEuler-1.0-LTS 1/6] dhugetlb: backport dynamic hugetlb feature
by Yongqiang Liu 04 Feb '23
From: Liu Shixin <liushixin2(a)hauwei.com>
hulk inclusion
category: feature
bugzilla: 46904, https://gitee.com/openeuler/kernel/issues/I6BDME
CVE: NA
--------------------------------
This feature is already supported on x86_64; this is the original
description:
Dynamic hugetlb, which is based on hugetlb, supports splitting huge
pages dynamically within a specified cgroup. We add a hugetlb_pool to
a mem_cgroup to manage dynamic hugetlb for the corresponding cgroup.
After dynamic hugepages are allocated for a cgroup, these hugepages
can be used as 1G/2M/4K pages by split/merge operations.
It is now supported on arm64. The feature is limited to depend on
ARM64_4K_PAGES and does not support cont-bits hugepages. We merged the
previous patches into one patch, which is patch[1]. While merging the
code, we found that some code can be isolated by CONFIG_DYNAMIC_HUGETLB,
so patch[2] re-isolates it. Patch[3] restricts the feature to the
limits mentioned above. Patch[4] skips dissolving hugepages, which may
conflict with memory hotplug and memory failure handling. Patch[5]
sets DYNAMIC_HUGETLB to y in hulk_defconfig to enable it by default.
This patch includes all previous patches, and the patch list is
recorded in bugzilla.
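As a usage note (an editorial assumption based on the Kconfig entry
added below and the arm64 restriction described above, not part of the
patch), enabling the feature would require roughly this configuration:

	CONFIG_ARM64_4K_PAGES=y
	CONFIG_HUGETLB_PAGE=y
	CONFIG_MEMCG=y
	CONFIG_CGROUP_HUGETLB=y
	CONFIG_DYNAMIC_HUGETLB=y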
Signed-off-by: Liu Shixin <liushixin2(a)hauwei.com>
Reviewed-by: Kefeng Wang <wangkefeng.wang(a)huawei.com>
Signed-off-by: Yongqiang Liu <liuyongqiang13(a)huawei.com>
---
fs/Kconfig | 9 +
fs/hugetlbfs/inode.c | 4 +
include/linux/gfp.h | 4 +-
include/linux/hugetlb.h | 97 +++
include/linux/memcontrol.h | 15 +
include/linux/page-flags.h | 3 +
include/trace/events/dhugetlb.h | 123 ++++
include/trace/events/mmflags.h | 1 +
kernel/cgroup/cgroup.c | 6 +
mm/huge_memory.c | 16 +-
mm/hugetlb.c | 1188 ++++++++++++++++++++++++++++++-
mm/internal.h | 1 +
mm/memcontrol.c | 391 ++++++++++
mm/page_alloc.c | 33 +-
14 files changed, 1862 insertions(+), 29 deletions(-)
create mode 100644 include/trace/events/dhugetlb.h
diff --git a/fs/Kconfig b/fs/Kconfig
index 5921bfbebee4..e8800d8a73b3 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -211,6 +211,15 @@ config TMPFS_INODE64
If unsure, say N.
+config DYNAMIC_HUGETLB
+ bool "Dynamic HugeTLB"
+ depends on HUGETLB_PAGE
+ depends on MEMCG
+ depends on CGROUP_HUGETLB
+ help
+ Dynamic hugepage are used in memcg and can be splited into small pages
+ automatically. The tasks in the memcg prefer to alloc dynamic hugepage.
+
config HUGETLBFS
bool "HugeTLB file system support"
depends on X86 || IA64 || SPARC64 || (S390 && 64BIT) || \
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 005e05c442c5..30a29936372c 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -1164,6 +1164,8 @@ static struct inode *hugetlbfs_alloc_inode(struct super_block *sb)
* private inode. This simplifies hugetlbfs_destroy_inode.
*/
mpol_shared_policy_init(&p->policy, NULL);
+ /* Initialize hpool here in case of a quick call to destroy */
+ p->hpool = get_dhugetlb_pool_from_task(current);
return &p->vfs_inode;
}
@@ -1178,6 +1180,8 @@ static void hugetlbfs_destroy_inode(struct inode *inode)
{
hugetlbfs_inc_free_inodes(HUGETLBFS_SB(inode->i_sb));
mpol_free_shared_policy(&HUGETLBFS_I(inode)->policy);
+ dhugetlb_pool_put(HUGETLBFS_I(inode)->hpool);
+ HUGETLBFS_I(inode)->hpool = NULL;
call_rcu(&inode->i_rcu, hugetlbfs_i_callback);
}
diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index 152cb9bdf436..74b0375d7d2b 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -501,7 +501,9 @@ static inline void arch_alloc_page(struct page *page, int order) { }
struct page *
__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, int preferred_nid,
nodemask_t *nodemask);
-
+void prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags,
+ unsigned int alloc_flags);
+bool free_pages_prepare(struct page *page, unsigned int order, bool check_free);
static inline struct page *
__alloc_pages(gfp_t gfp_mask, unsigned int order, int preferred_nid)
{
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 2d2b06b36bd0..3a82ea9283ec 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -289,6 +289,7 @@ struct hugetlbfs_inode_info {
struct shared_policy policy;
struct inode vfs_inode;
unsigned int seals;
+ struct dhugetlb_pool *hpool;
};
static inline struct hugetlbfs_inode_info *HUGETLBFS_I(struct inode *inode)
@@ -655,6 +656,102 @@ static inline void set_huge_swap_pte_at(struct mm_struct *mm, unsigned long addr
#endif /* CONFIG_HUGETLB_PAGE */
+#ifdef CONFIG_DYNAMIC_HUGETLB
+/* The number of small_page_pool for a dhugetlb_pool */
+#define NR_SMPOOL num_possible_cpus()
+/* The max page number in a small_page_pool */
+#define MAX_SMPOOL_PAGE 1024
+/* number to move between list */
+#define BATCH_SMPOOL_PAGE (MAX_SMPOOL_PAGE >> 2)
+/* We don't need to try 5 times, or we can't migrate the pages. */
+#define HPOOL_RECLAIM_RETRIES 5
+
+extern struct static_key_false dhugetlb_enabled_key;
+#define dhugetlb_enabled (static_branch_unlikely(&dhugetlb_enabled_key))
+
+#define DEFAULT_PAGESIZE 4096
+extern rwlock_t dhugetlb_pagelist_rwlock;
+struct dhugetlb_pagelist {
+ unsigned long count;
+ struct dhugetlb_pool *hpool[0];
+};
+extern struct dhugetlb_pagelist *dhugetlb_pagelist_t;
+
+struct split_pages {
+ struct list_head list;
+ unsigned long start_pfn;
+ unsigned long free_pages;
+};
+
+struct small_page_pool {
+ spinlock_t lock;
+ unsigned long free_pages;
+ long used_pages;
+ struct list_head head_page;
+};
+
+struct dhugetlb_pool {
+ int nid;
+ spinlock_t lock;
+ spinlock_t reserved_lock;
+ atomic_t refcnt;
+
+ struct mem_cgroup *attach_memcg;
+
+ struct list_head dhugetlb_1G_freelists;
+ struct list_head dhugetlb_2M_freelists;
+ struct list_head dhugetlb_4K_freelists;
+
+ struct list_head split_1G_freelists;
+ struct list_head split_2M_freelists;
+
+ unsigned long total_nr_pages;
+
+ unsigned long total_reserved_1G;
+ unsigned long free_reserved_1G;
+ unsigned long mmap_reserved_1G;
+ unsigned long used_1G;
+ unsigned long free_unreserved_1G;
+ unsigned long nr_split_1G;
+
+ unsigned long total_reserved_2M;
+ unsigned long free_reserved_2M;
+ unsigned long mmap_reserved_2M;
+ unsigned long used_2M;
+ unsigned long free_unreserved_2M;
+ unsigned long nr_split_2M;
+
+ unsigned long free_pages;
+ struct small_page_pool smpool[0];
+};
+
+bool dhugetlb_pool_get(struct dhugetlb_pool *hpool);
+void dhugetlb_pool_put(struct dhugetlb_pool *hpool);
+struct dhugetlb_pool *hpool_alloc(unsigned long nid);
+int alloc_hugepage_from_hugetlb(struct dhugetlb_pool *hpool,
+ unsigned long nid, unsigned long size);
+bool free_dhugetlb_pool(struct dhugetlb_pool *hpool);
+int update_dhugetlb_pagelist(unsigned long idx, struct dhugetlb_pool *hpool);
+struct dhugetlb_pool *get_dhugetlb_pool_from_dhugetlb_pagelist(
+ struct page *page);
+struct dhugetlb_pool *get_dhugetlb_pool_from_task(struct task_struct *tsk);
+bool move_pages_from_hpool_to_smpool(struct dhugetlb_pool *hpool,
+ struct small_page_pool *smpool);
+void move_pages_from_smpool_to_hpool(struct dhugetlb_pool *hpool,
+ struct small_page_pool *smpool);
+void dhugetlb_reserve_hugepages(struct dhugetlb_pool *hpool,
+ unsigned long count, bool gigantic);
+#else
+#define dhugetlb_enabled 0
+struct dhugetlb_pool {};
+static inline struct dhugetlb_pool *get_dhugetlb_pool_from_task(
+ struct task_struct *tsk)
+{
+ return NULL;
+}
+static inline void dhugetlb_pool_put(struct dhugetlb_pool *hpool) { return; }
+#endif /* CONFIG_DYNAMIC_HUGETLB */
+
static inline spinlock_t *huge_pte_lock(struct hstate *h,
struct mm_struct *mm, pte_t *pte)
{
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 4517d132d1e2..22f40d5e0e8b 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -326,6 +326,7 @@ struct mem_cgroup {
};
struct mem_cgroup_extension {
+ struct dhugetlb_pool *hpool;
#ifdef CONFIG_MEMCG_QOS
/* Currently support 0 and -1.
* in the future it can expand to other value.
@@ -1406,4 +1407,18 @@ static inline void memcg_put_cache_ids(void)
#endif /* CONFIG_MEMCG_KMEM */
+#ifdef CONFIG_DYNAMIC_HUGETLB
+struct dhugetlb_pool *get_dhugetlb_pool_from_memcg(struct mem_cgroup *memcg);
+struct page *alloc_page_from_dhugetlb_pool(gfp_t gfp_mask);
+void free_page_to_dhugetlb_pool(struct page *page);
+int dhugetlb_pool_force_empty(struct mem_cgroup *memcg);
+bool dhugetlb_pool_is_free(struct cgroup_subsys_state *css);
+#else
+static inline struct page *alloc_page_from_dhugetlb_pool(gfp_t gfp_mask)
+{
+ return NULL;
+}
+static inline void free_page_to_dhugetlb_pool(struct page *page) {}
+#endif
+
#endif /* _LINUX_MEMCONTROL_H */
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index 0c5d1c4c71e6..fd6cd68e00a2 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -102,6 +102,7 @@ enum pageflags {
PG_idle,
#endif
PG_percpu_ref,
+ PG_pool,
__NR_PAGEFLAGS,
/* Filesystems */
@@ -284,6 +285,7 @@ PAGEFLAG(Active, active, PF_HEAD) __CLEARPAGEFLAG(Active, active, PF_HEAD)
__PAGEFLAG(Slab, slab, PF_NO_TAIL)
__PAGEFLAG(SlobFree, slob_free, PF_NO_TAIL)
PAGEFLAG(Checked, checked, PF_NO_COMPOUND) /* Used by some filesystems */
+PAGEFLAG(Pool, pool, PF_NO_TAIL)
/* Xen */
PAGEFLAG(Pinned, pinned, PF_NO_COMPOUND)
@@ -770,6 +772,7 @@ static inline void ClearPageSlabPfmemalloc(struct page *page)
1UL << PG_private | 1UL << PG_private_2 | \
1UL << PG_writeback | 1UL << PG_reserved | \
1UL << PG_slab | 1UL << PG_active | \
+ 1UL << PG_pool | \
1UL << PG_unevictable | __PG_MLOCKED)
/*
diff --git a/include/trace/events/dhugetlb.h b/include/trace/events/dhugetlb.h
new file mode 100644
index 000000000000..20b3a54589d1
--- /dev/null
+++ b/include/trace/events/dhugetlb.h
@@ -0,0 +1,123 @@
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM dhugetlb
+
+#if !defined(_TRACE_DHUGETLB_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_DHUGETLB_H
+
+#include <linux/tracepoint.h>
+#include <trace/events/mmflags.h>
+
+#define DHUGETLB_SPLIT_1G 0x01u
+#define DHUGETLB_SPLIT_2M 0x02u
+#define DHUGETLB_MERGE_4K 0x04u
+#define DHUGETLB_MIGRATE_4K 0x08u
+#define DHUGETLB_RESV_1G 0x10u
+#define DHUGETLB_UNRESV_1G 0x20u
+#define DHUGETLB_RESV_2M 0x40u
+#define DHUGETLB_UNRESV_2M 0x80u
+#define DHUGETLB_ALLOC_1G 0x100u
+#define DHUGETLB_FREE_1G 0x200u
+#define DHUGETLB_ALLOC_2M 0x400u
+#define DHUGETLB_FREE_2M 0x800u
+
+#define __def_action_names \
+ {(unsigned long)DHUGETLB_SPLIT_1G, "split_1G_to_2M"}, \
+ {(unsigned long)DHUGETLB_SPLIT_2M, "split_2M_to_4K"}, \
+ {(unsigned long)DHUGETLB_MERGE_4K, "merge_4K_to_2M"}, \
+ {(unsigned long)DHUGETLB_MIGRATE_4K, "migrate_4K_to_2M"}, \
+ {(unsigned long)DHUGETLB_RESV_1G, "resv_1G_page"}, \
+ {(unsigned long)DHUGETLB_UNRESV_1G, "unresv_1G_page"}, \
+ {(unsigned long)DHUGETLB_RESV_2M, "resv_2M_page"}, \
+ {(unsigned long)DHUGETLB_UNRESV_2M, "unresv_2M_page"}, \
+ {(unsigned long)DHUGETLB_ALLOC_1G, "alloc_1G_page"}, \
+ {(unsigned long)DHUGETLB_FREE_1G, "free_1G_page"}, \
+ {(unsigned long)DHUGETLB_ALLOC_2M, "alloc_2M_page"}, \
+ {(unsigned long)DHUGETLB_FREE_2M, "free_2M_page"}
+
+#define show_action(action) \
+ (action) ? __print_flags(action, "", \
+ __def_action_names \
+ ) : "none"
+
+TRACE_EVENT(dhugetlb_split_merge,
+
+ TP_PROTO(const void *hpool, struct page *page, unsigned long action),
+
+ TP_ARGS(hpool, page, action),
+
+ TP_STRUCT__entry(
+ __field( const void *, hpool )
+ __field( unsigned long, pfn )
+ __field( unsigned long, action )
+ ),
+
+ TP_fast_assign(
+ __entry->hpool = hpool;
+ __entry->pfn = page ? page_to_pfn(page) : -1UL;
+ __entry->action = action;
+ ),
+
+ TP_printk("hpool=%p page=%p pfn=%lu action=%s",
+ __entry->hpool,
+ __entry->pfn != -1UL ? pfn_to_page(__entry->pfn) : NULL,
+ __entry->pfn != -1UL ? __entry->pfn : 0,
+ show_action(__entry->action))
+);
+
+TRACE_EVENT(dhugetlb_acct_memory,
+
+ TP_PROTO(const void *hpool, unsigned long count, unsigned long action),
+
+ TP_ARGS(hpool, count, action),
+
+ TP_STRUCT__entry(
+ __field( const void *, hpool )
+ __field( unsigned long, count )
+ __field( unsigned long, action )
+ ),
+
+ TP_fast_assign(
+ __entry->hpool = hpool;
+ __entry->count = count;
+ __entry->action = action;
+ ),
+
+ TP_printk("hpool=%p action=%s, mmap_count=%lu",
+ __entry->hpool,
+ show_action(__entry->action),
+ __entry->count)
+);
+
+TRACE_EVENT(dhugetlb_alloc_free,
+
+ TP_PROTO(const void *hpool, struct page *page, unsigned long count,
+ unsigned long action),
+
+ TP_ARGS(hpool, page, count, action),
+
+ TP_STRUCT__entry(
+ __field( const void *, hpool )
+ __field( unsigned long, pfn )
+ __field( unsigned long, count )
+ __field( unsigned long, action )
+ ),
+
+ TP_fast_assign(
+ __entry->hpool = hpool;
+ __entry->pfn = page ? page_to_pfn(page) : -1UL;
+ __entry->count = count;
+ __entry->action = action;
+ ),
+
+ TP_printk("hpool=%p page=%p pfn=%lu action=%s free_count=%lu",
+ __entry->hpool,
+ __entry->pfn != -1UL ? pfn_to_page(__entry->pfn) : NULL,
+ __entry->pfn != -1UL ? __entry->pfn : 0,
+ show_action(__entry->action),
+ __entry->count)
+);
+
+#endif /* _TRACE_DHUGETLB_H */
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
diff --git a/include/trace/events/mmflags.h b/include/trace/events/mmflags.h
index b817bf1885a0..4d06b47129f3 100644
--- a/include/trace/events/mmflags.h
+++ b/include/trace/events/mmflags.h
@@ -81,6 +81,7 @@
#define __def_pageflag_names \
{1UL << PG_locked, "locked" }, \
+ {1UL << PG_pool, "pool" }, \
{1UL << PG_waiters, "waiters" }, \
{1UL << PG_error, "error" }, \
{1UL << PG_referenced, "referenced" }, \
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 7456882e1a0f..b01490b71f32 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -65,6 +65,7 @@
/* let's not notify more than 100 times per second */
#define CGROUP_FILE_NOTIFY_MIN_INTV DIV_ROUND_UP(HZ, 100)
+bool dhugetlb_pool_is_free(struct cgroup_subsys_state *css);
/*
* cgroup_mutex is the master lock. Any modification to cgroup or its
* hierarchy must be performed while holding it.
@@ -5280,6 +5281,11 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
if (css_has_online_children(&cgrp->self))
return -EBUSY;
+#ifdef CONFIG_MEMCG
+ /* If we use dynamic hugetlb, make sure dhugtlb_pool is free */
+ if (!dhugetlb_pool_is_free(cgrp->subsys[memory_cgrp_id]))
+ return -EBUSY;
+#endif
/*
* Mark @cgrp and the associated csets dead. The former prevents
* further task migration and child creation by disabling
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index f8319265c1cf..484ffdbf5f45 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -396,6 +396,20 @@ static int __init hugepage_init(void)
return -EINVAL;
}
+ /*
+ * When we alloc some pages(order = 0), system may help us to alloc
+ * a page(order > 0) due to transparent hugepage. This result
+ * dynamic hugetlb to be skipped. Actually, using dynamic hugetlb
+ * means we have already optimized the program, so we should not
+ * use transparent hugepage in addition.
+ * (May result negative optimization)
+ */
+ if (dhugetlb_enabled) {
+ transparent_hugepage_flags = 0;
+ pr_info("transparent hugepage is disabled due to confilct with dynamic hugetlb\n");
+ return -EINVAL;
+ }
+
/*
* hugepages can't be allocated by the buddy allocator
*/
@@ -2946,9 +2960,9 @@ static unsigned long deferred_split_count(struct shrinker *shrink,
{
struct pglist_data *pgdata = NODE_DATA(sc->nid);
unsigned long *split_queue_len = &pgdata->split_queue_len;
+#ifdef CONFIG_MEMCG
struct mem_cgroup_extension *memcg_ext;
-#ifdef CONFIG_MEMCG
if (sc->memcg) {
memcg_ext = container_of(sc->memcg, struct mem_cgroup_extension, memcg);
split_queue_len = &memcg_ext->split_queue_len;
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 495d8b5b38fc..4c8c91acd6d5 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -27,6 +27,12 @@
#include <linux/jhash.h>
#include <linux/mman.h>
#include <linux/share_pool.h>
+#include <linux/kthread.h>
+#include <linux/cpuhotplug.h>
+#include <linux/freezer.h>
+#include <linux/delay.h>
+#include <linux/migrate.h>
+#include <linux/mm_inline.h>
#include <asm/page.h>
#include <asm/pgtable.h>
@@ -39,8 +45,14 @@
#include <linux/userfaultfd_k.h>
#include <linux/page_owner.h>
#include <linux/share_pool.h>
+#include <linux/memblock.h>
#include "internal.h"
+#if (defined CONFIG_DYNAMIC_HUGETLB) && (!defined __GENKSYMS__)
+#define CREATE_TRACE_POINTS
+#include <trace/events/dhugetlb.h>
+#endif
+
int hugetlb_max_hstate __read_mostly;
unsigned int default_hstate_idx;
struct hstate hstates[HUGE_MAX_HSTATE];
@@ -89,7 +101,8 @@ static inline void ClearPageHugeFreed(struct page *head)
}
/* Forward declaration */
-static int hugetlb_acct_memory(struct hstate *h, long delta);
+static int hugetlb_acct_memory(struct hstate *h, long delta,
+ struct dhugetlb_pool *hpool);
static inline void unlock_or_release_subpool(struct hugepage_subpool *spool)
{
@@ -103,7 +116,7 @@ static inline void unlock_or_release_subpool(struct hugepage_subpool *spool)
if (free) {
if (spool->min_hpages != -1)
hugetlb_acct_memory(spool->hstate,
- -spool->min_hpages);
+ -spool->min_hpages, NULL);
kfree(spool);
}
}
@@ -123,7 +136,7 @@ struct hugepage_subpool *hugepage_new_subpool(struct hstate *h, long max_hpages,
spool->hstate = h;
spool->min_hpages = min_hpages;
- if (min_hpages != -1 && hugetlb_acct_memory(h, min_hpages)) {
+ if (min_hpages != -1 && hugetlb_acct_memory(h, min_hpages, NULL)) {
kfree(spool);
return NULL;
}
@@ -149,13 +162,17 @@ void hugepage_put_subpool(struct hugepage_subpool *spool)
* a subpool minimum size must be manitained.
*/
static long hugepage_subpool_get_pages(struct hugepage_subpool *spool,
- long delta)
+ long delta, struct dhugetlb_pool *hpool)
{
long ret = delta;
if (!spool)
return ret;
+ /* Skip subpool when hugetlb file belongs to a hugetlb_pool */
+ if (dhugetlb_enabled && hpool)
+ return ret;
+
spin_lock(&spool->lock);
if (spool->max_hpages != -1) { /* maximum size accounting */
@@ -194,13 +211,17 @@ static long hugepage_subpool_get_pages(struct hugepage_subpool *spool,
* in the case where a subpool minimum size must be maintained.
*/
static long hugepage_subpool_put_pages(struct hugepage_subpool *spool,
- long delta)
+ long delta, struct dhugetlb_pool *hpool)
{
long ret = delta;
if (!spool)
return delta;
+ /* Skip subpool when hugetlb file belongs to a hugetlb_pool */
+ if (dhugetlb_enabled && hpool)
+ return ret;
+
spin_lock(&spool->lock);
if (spool->max_hpages != -1) /* maximum size accounting */
@@ -594,12 +615,13 @@ void hugetlb_fix_reserve_counts(struct inode *inode)
struct hugepage_subpool *spool = subpool_inode(inode);
long rsv_adjust;
bool reserved = false;
+ struct dhugetlb_pool *hpool = HUGETLBFS_I(inode)->hpool;
- rsv_adjust = hugepage_subpool_get_pages(spool, 1);
+ rsv_adjust = hugepage_subpool_get_pages(spool, 1, hpool);
if (rsv_adjust > 0) {
struct hstate *h = hstate_inode(inode);
- if (!hugetlb_acct_memory(h, 1))
+ if (!hugetlb_acct_memory(h, 1, hpool))
reserved = true;
} else if (!rsv_adjust) {
reserved = true;
@@ -1300,6 +1322,56 @@ static inline void ClearPageHugeTemporary(struct page *page)
page[2].mapping = NULL;
}
+#ifdef CONFIG_DYNAMIC_HUGETLB
+static void free_huge_page_to_dhugetlb_pool(struct page *page,
+ bool restore_reserve)
+{
+ struct hstate *h = page_hstate(page);
+ struct dhugetlb_pool *hpool;
+
+ hpool = get_dhugetlb_pool_from_dhugetlb_pagelist(page);
+ if (unlikely(!hpool)) {
+ pr_err("dhugetlb: free error: get hpool failed\n");
+ return;
+ }
+
+ spin_lock(&hpool->lock);
+ ClearPagePool(page);
+ set_compound_page_dtor(page, NULL_COMPOUND_DTOR);
+ if (!hstate_is_gigantic(h)) {
+ list_add(&page->lru, &hpool->dhugetlb_2M_freelists);
+ hpool->free_reserved_2M++;
+ hpool->used_2M--;
+ if (restore_reserve) {
+ hpool->mmap_reserved_2M++;
+ trace_dhugetlb_acct_memory(hpool,
+ hpool->mmap_reserved_2M,
+ DHUGETLB_RESV_2M);
+ }
+ trace_dhugetlb_alloc_free(hpool, page, hpool->free_reserved_2M,
+ DHUGETLB_FREE_2M);
+ } else {
+ list_add(&page->lru, &hpool->dhugetlb_1G_freelists);
+ hpool->free_reserved_1G++;
+ hpool->used_1G--;
+ if (restore_reserve) {
+ hpool->mmap_reserved_1G++;
+ trace_dhugetlb_acct_memory(hpool,
+ hpool->mmap_reserved_1G,
+ DHUGETLB_RESV_1G);
+ }
+ trace_dhugetlb_alloc_free(hpool, page, hpool->free_reserved_1G,
+ DHUGETLB_FREE_1G);
+ }
+ spin_unlock(&hpool->lock);
+ dhugetlb_pool_put(hpool);
+}
+#else
+void free_huge_page_to_dhugetlb_pool(struct page *page, bool restore_reserve)
+{
+}
+#endif
+
void free_huge_page(struct page *page)
{
/*
@@ -1320,6 +1392,17 @@ void free_huge_page(struct page *page)
restore_reserve = PagePrivate(page);
ClearPagePrivate(page);
+ if (dhugetlb_enabled && PagePool(page)) {
+ spin_lock(&hugetlb_lock);
+ clear_page_huge_active(page);
+ list_del(&page->lru);
+ hugetlb_cgroup_uncharge_page(hstate_index(h),
+ pages_per_huge_page(h), page);
+ spin_unlock(&hugetlb_lock);
+ free_huge_page_to_dhugetlb_pool(page, restore_reserve);
+ return;
+ }
+
/*
* If PagePrivate() was set on page, page allocation consumed a
* reservation. If the page was associated with a subpool, there
@@ -1335,7 +1418,7 @@ void free_huge_page(struct page *page)
* after page is free. Therefore, force restore_reserve
* operation.
*/
- if (hugepage_subpool_put_pages(spool, 1) == 0)
+ if (hugepage_subpool_put_pages(spool, 1, NULL) == 0)
restore_reserve = true;
}
@@ -2211,6 +2294,81 @@ static void restore_reserve_on_error(struct hstate *h,
}
}
+#ifdef CONFIG_DYNAMIC_HUGETLB
+static struct page *__alloc_huge_page_from_dhugetlb_pool(
+ struct dhugetlb_pool *hpool, int idx, bool need_unreserved)
+{
+ unsigned long flags;
+ struct page *page = NULL;
+
+ spin_lock_irqsave(&hpool->lock, flags);
+ if (hstate_is_gigantic(&hstates[idx]) && hpool->free_reserved_1G) {
+ page = list_entry(hpool->dhugetlb_1G_freelists.next,
+ struct page, lru);
+ list_del(&page->lru);
+ hpool->free_reserved_1G--;
+ hpool->used_1G++;
+ if (need_unreserved) {
+ SetPagePrivate(page);
+ hpool->mmap_reserved_1G--;
+ trace_dhugetlb_acct_memory(hpool,
+ hpool->mmap_reserved_1G,
+ DHUGETLB_UNRESV_1G);
+ }
+ trace_dhugetlb_alloc_free(hpool, page, hpool->free_reserved_1G,
+ DHUGETLB_ALLOC_1G);
+ } else if (!hstate_is_gigantic(&hstates[idx]) &&
+ hpool->free_reserved_2M) {
+ page = list_entry(hpool->dhugetlb_2M_freelists.next,
+ struct page, lru);
+ list_del(&page->lru);
+ hpool->free_reserved_2M--;
+ hpool->used_2M++;
+ if (need_unreserved) {
+ SetPagePrivate(page);
+ hpool->mmap_reserved_2M--;
+ trace_dhugetlb_acct_memory(hpool,
+ hpool->mmap_reserved_2M,
+ DHUGETLB_UNRESV_2M);
+ }
+ trace_dhugetlb_alloc_free(hpool, page, hpool->free_reserved_2M,
+ DHUGETLB_ALLOC_2M);
+ }
+ if (page) {
+ INIT_LIST_HEAD(&page->lru);
+ set_compound_page_dtor(page, HUGETLB_PAGE_DTOR);
+ set_page_refcounted(page);
+ SetPagePool(page);
+ }
+ spin_unlock_irqrestore(&hpool->lock, flags);
+
+ return page;
+}
+
+static struct page *alloc_huge_page_from_dhugetlb_pool(
+ struct vm_area_struct *vma, int idx, int avoid_reserve,
+ long gbl_chg, struct dhugetlb_pool *hpool)
+{
+ struct page *page;
+ bool need_unreserved = false;
+
+ if (!avoid_reserve && vma_has_reserves(vma, gbl_chg))
+ need_unreserved = true;
+
+ page = __alloc_huge_page_from_dhugetlb_pool(hpool, idx,
+ need_unreserved);
+
+ return page;
+}
+#else
+static inline struct page *alloc_huge_page_from_dhugetlb_pool(
+ struct vm_area_struct *vma, int idx, int avoid_reserve,
+ long gbl_chg, struct dhugetlb_pool *hpool)
+{
+ return NULL;
+}
+#endif
+
struct page *alloc_huge_page(struct vm_area_struct *vma,
unsigned long addr, int avoid_reserve)
{
@@ -2221,6 +2379,8 @@ struct page *alloc_huge_page(struct vm_area_struct *vma,
long gbl_chg;
int ret, idx;
struct hugetlb_cgroup *h_cg;
+ struct dhugetlb_pool *hpool =
+ HUGETLBFS_I(file_inode(vma->vm_file))->hpool;
idx = hstate_index(h);
/*
@@ -2240,7 +2400,7 @@ struct page *alloc_huge_page(struct vm_area_struct *vma,
* checked against any subpool limit.
*/
if (map_chg || avoid_reserve) {
- gbl_chg = hugepage_subpool_get_pages(spool, 1);
+ gbl_chg = hugepage_subpool_get_pages(spool, 1, hpool);
if (gbl_chg < 0) {
vma_end_reservation(h, vma, addr);
return ERR_PTR(-ENOSPC);
@@ -2262,6 +2422,26 @@ struct page *alloc_huge_page(struct vm_area_struct *vma,
if (ret)
goto out_subpool_put;
+ if (dhugetlb_enabled && hpool) {
+ page = alloc_huge_page_from_dhugetlb_pool(vma, idx,
+ avoid_reserve,
+ gbl_chg, hpool);
+ if (page) {
+ /*
+ * Use hugetlb_lock to manage the account of
+ * hugetlb cgroup.
+ */
+ spin_lock(&hugetlb_lock);
+ list_add(&page->lru, &h->hugepage_activelist);
+ hugetlb_cgroup_commit_charge(idx,
+ pages_per_huge_page(hstate_vma(vma)),
+ h_cg, page);
+ spin_unlock(&hugetlb_lock);
+ goto out;
+ }
+ goto out_uncharge_cgroup;
+ }
+
spin_lock(&hugetlb_lock);
/*
* glb_chg is passed to indicate whether or not a page must be taken
@@ -2284,7 +2464,7 @@ struct page *alloc_huge_page(struct vm_area_struct *vma,
}
hugetlb_cgroup_commit_charge(idx, pages_per_huge_page(h), h_cg, page);
spin_unlock(&hugetlb_lock);
-
+out:
set_page_private(page, (unsigned long)spool);
map_commit = vma_commit_reservation(h, vma, addr);
@@ -2300,8 +2480,8 @@ struct page *alloc_huge_page(struct vm_area_struct *vma,
*/
long rsv_adjust;
- rsv_adjust = hugepage_subpool_put_pages(spool, 1);
- hugetlb_acct_memory(h, -rsv_adjust);
+ rsv_adjust = hugepage_subpool_put_pages(spool, 1, hpool);
+ hugetlb_acct_memory(h, -rsv_adjust, hpool);
}
return page;
@@ -2309,7 +2489,7 @@ struct page *alloc_huge_page(struct vm_area_struct *vma,
hugetlb_cgroup_uncharge_cgroup(idx, pages_per_huge_page(h), h_cg);
out_subpool_put:
if (map_chg || avoid_reserve)
- hugepage_subpool_put_pages(spool, 1);
+ hugepage_subpool_put_pages(spool, 1, hpool);
vma_end_reservation(h, vma, addr);
return ERR_PTR(-ENOSPC);
}
@@ -3098,6 +3278,932 @@ static void hugetlb_register_all_nodes(void) { }
#endif
+#ifdef CONFIG_DYNAMIC_HUGETLB
+static bool enable_dhugetlb;
+DEFINE_STATIC_KEY_FALSE(dhugetlb_enabled_key);
+DEFINE_RWLOCK(dhugetlb_pagelist_rwlock);
+struct dhugetlb_pagelist *dhugetlb_pagelist_t;
+
+bool dhugetlb_pool_get(struct dhugetlb_pool *hpool)
+{
+ if (!hpool)
+ return false;
+
+ return atomic_inc_not_zero(&hpool->refcnt);
+}
+
+void dhugetlb_pool_put(struct dhugetlb_pool *hpool)
+{
+ if (!dhugetlb_enabled || !hpool)
+ return;
+
+ if (atomic_dec_and_test(&hpool->refcnt)) {
+ css_put(&hpool->attach_memcg->css);
+ kfree(hpool);
+ }
+}
+
+struct dhugetlb_pool *hpool_alloc(unsigned long nid)
+{
+ int i;
+ struct dhugetlb_pool *hpool;
+
+ hpool = kzalloc(sizeof(struct dhugetlb_pool) +
+ NR_SMPOOL * sizeof(struct small_page_pool), GFP_KERNEL);
+ if (!hpool)
+ return NULL;
+
+ spin_lock_init(&hpool->lock);
+ spin_lock_init(&hpool->reserved_lock);
+ hpool->nid = nid;
+ atomic_set(&hpool->refcnt, 1);
+ INIT_LIST_HEAD(&hpool->dhugetlb_1G_freelists);
+ INIT_LIST_HEAD(&hpool->dhugetlb_2M_freelists);
+ INIT_LIST_HEAD(&hpool->dhugetlb_4K_freelists);
+ INIT_LIST_HEAD(&hpool->split_1G_freelists);
+ INIT_LIST_HEAD(&hpool->split_2M_freelists);
+
+ for (i = 0; i < NR_SMPOOL; i++) {
+ spin_lock_init(&hpool->smpool[i].lock);
+ INIT_LIST_HEAD(&hpool->smpool[i].head_page);
+ }
+
+ return hpool;
+}
+
+int alloc_hugepage_from_hugetlb(struct dhugetlb_pool *hpool,
+ unsigned long nid, unsigned long size)
+{
+ int ret;
+ struct page *page, *next;
+ unsigned long idx;
+ unsigned long i = 0;
+ struct hstate *h = size_to_hstate(PUD_SIZE);
+
+ if (!h)
+ return -ENOMEM;
+
+ spin_lock(&hpool->lock);
+ spin_lock(&hugetlb_lock);
+ if (h->free_huge_pages_node[nid] < size) {
+ ret = -ENOMEM;
+ goto out_unlock;
+ }
+
+ list_for_each_entry_safe(page, next, &h->hugepage_freelists[nid], lru) {
+ idx = page_to_pfn(page) >> (PUD_SHIFT - PAGE_SHIFT);
+ ret = update_dhugetlb_pagelist(idx, hpool);
+ if (ret)
+ continue;
+ ClearPageHugeFreed(page);
+ list_move_tail(&page->lru, &hpool->dhugetlb_1G_freelists);
+ h->free_huge_pages--;
+ h->free_huge_pages_node[nid]--;
+ hpool->total_nr_pages++;
+ hpool->free_unreserved_1G++;
+ if (++i == size)
+ break;
+ }
+ ret = 0;
+out_unlock:
+ spin_unlock(&hugetlb_lock);
+ spin_unlock(&hpool->lock);
+ return ret;
+}
+
+/*
+ * When we assign a hugepage to dhugetlb_pool, we need to record it in
+ * dhugetlb_pagelist_t. In this situation, we just need read_lock because
+ * there is no conflict when writing to dhugetlb_pagelist_t->hpool.
+ *
+ * If the page's index is not less than dhugetlb_pagelist_t->count (which may
+ * occur due to memory hotplug), we need to realloc enough memory so that
+ * idx = dhugetlb_pagelist_t->count - 1 and then record it.
+ * In this situation, we need write_lock because while we are reallocating,
+ * the read request should wait.
+ */
+int update_dhugetlb_pagelist(unsigned long idx, struct dhugetlb_pool *hpool)
+{
+ read_lock(&dhugetlb_pagelist_rwlock);
+ if (idx >= dhugetlb_pagelist_t->count) {
+ unsigned long size;
+ struct dhugetlb_pagelist *tmp;
+
+ read_unlock(&dhugetlb_pagelist_rwlock);
+ write_lock(&dhugetlb_pagelist_rwlock);
+
+ size = sizeof(struct dhugetlb_pagelist) +
+ (idx + 1) * sizeof(struct dhugetlb_pool *);
+ tmp = krealloc(dhugetlb_pagelist_t, size, GFP_ATOMIC);
+ if (!tmp) {
+ write_unlock(&dhugetlb_pagelist_rwlock);
+ return -ENOMEM;
+ }
+ tmp->count = idx + 1;
+ dhugetlb_pagelist_t = tmp;
+
+ write_unlock(&dhugetlb_pagelist_rwlock);
+ read_lock(&dhugetlb_pagelist_rwlock);
+ }
+ dhugetlb_pagelist_t->hpool[idx] = hpool;
+ read_unlock(&dhugetlb_pagelist_rwlock);
+ return 0;
+}
+
+struct dhugetlb_pool *get_dhugetlb_pool_from_dhugetlb_pagelist(
+ struct page *page)
+{
+ struct dhugetlb_pool *hpool = NULL;
+ unsigned long idx = page_to_pfn(page) >> (PUD_SHIFT - PAGE_SHIFT);
+
+ read_lock(&dhugetlb_pagelist_rwlock);
+ if (idx < dhugetlb_pagelist_t->count)
+ hpool = dhugetlb_pagelist_t->hpool[idx];
+ read_unlock(&dhugetlb_pagelist_rwlock);
+ if (dhugetlb_pool_get(hpool))
+ return hpool;
+ return NULL;
+}
+
+struct dhugetlb_pool *get_dhugetlb_pool_from_task(struct task_struct *tsk)
+{
+ struct mem_cgroup *memcg;
+ struct dhugetlb_pool *hpool;
+
+ if (!dhugetlb_enabled)
+ return NULL;
+
+ rcu_read_lock();
+ memcg = mem_cgroup_from_task(tsk);
+ rcu_read_unlock();
+
+ hpool = get_dhugetlb_pool_from_memcg(memcg);
+
+ return hpool;
+}
+
+static void add_new_huge_page_to_pool(struct dhugetlb_pool *hpool,
+ struct page *page, bool gigantic)
+{
+ lockdep_assert_held(&hpool->lock);
+ VM_BUG_ON_PAGE(page_mapcount(page), page);
+ INIT_LIST_HEAD(&page->lru);
+
+ if (gigantic) {
+ prep_compound_gigantic_page(page, PUD_SHIFT - PAGE_SHIFT);
+ list_add_tail(&page->lru, &hpool->dhugetlb_1G_freelists);
+ hpool->free_unreserved_1G++;
+ } else {
+ prep_new_page(page, PMD_SHIFT - PAGE_SHIFT, __GFP_COMP, 0);
+ set_page_count(page, 0);
+ list_add_tail(&page->lru, &hpool->dhugetlb_2M_freelists);
+ hpool->free_unreserved_2M++;
+ }
+ set_page_private(page, 0);
+ page->mapping = NULL;
+ set_compound_page_dtor(page, HUGETLB_PAGE_DTOR);
+ set_hugetlb_cgroup(page, NULL);
+}
+
+static void free_dhugetlb_pcpool(struct dhugetlb_pool *hpool)
+{
+ int i;
+ struct small_page_pool *smpool;
+
+ for (i = 0; i < NR_SMPOOL; i++) {
+ smpool = &hpool->smpool[i];
+ list_splice(&smpool->head_page, &hpool->dhugetlb_4K_freelists);
+ smpool->free_pages = 0;
+ smpool->used_pages = 0;
+ INIT_LIST_HEAD(&smpool->head_page);
+ }
+}
+
+static void __free_dhugetlb_small_page(struct dhugetlb_pool *hpool)
+{
+ struct page *page, *next;
+ struct split_pages *split_huge, *split_next;
+
+ if (list_empty(&hpool->dhugetlb_4K_freelists))
+ return;
+
+ list_for_each_entry_safe(page, next,
+ &hpool->dhugetlb_4K_freelists, lru) {
+ list_del(&page->lru);
+ add_new_huge_page_to_pool(hpool, page, false);
+ }
+
+ list_for_each_entry_safe(split_huge, split_next,
+ &hpool->split_2M_freelists, list) {
+ list_del(&split_huge->list);
+ kfree(split_huge);
+ hpool->nr_split_2M--;
+ }
+
+ hpool->free_pages = 0;
+ INIT_LIST_HEAD(&hpool->dhugetlb_4K_freelists);
+}
+
+static void free_dhugetlb_small_page(struct dhugetlb_pool *hpool)
+{
+ struct page *page, *next;
+ unsigned long nr_pages = 1 << (PMD_SHIFT - PAGE_SHIFT);
+
+ lockdep_assert_held(&hpool->lock);
+ if (list_empty(&hpool->dhugetlb_4K_freelists))
+ return;
+
+ list_for_each_entry_safe(page, next,
+ &hpool->dhugetlb_4K_freelists, lru) {
+ if (page_to_pfn(page) % nr_pages != 0)
+ list_del(&page->lru);
+ }
+
+ __free_dhugetlb_small_page(hpool);
+}
+
+static void __free_dhugetlb_huge_page(struct dhugetlb_pool *hpool)
+{
+ struct page *page, *next;
+ struct split_pages *split_giga, *split_next;
+
+ if (list_empty(&hpool->dhugetlb_2M_freelists))
+ return;
+
+ list_for_each_entry_safe(page, next,
+ &hpool->dhugetlb_2M_freelists, lru) {
+ list_del(&page->lru);
+ add_new_huge_page_to_pool(hpool, page, true);
+ }
+ list_for_each_entry_safe(split_giga, split_next,
+ &hpool->split_1G_freelists, list) {
+ list_del(&split_giga->list);
+ kfree(split_giga);
+ hpool->nr_split_1G--;
+ }
+
+ hpool->total_reserved_2M = 0;
+ hpool->free_reserved_2M = 0;
+ hpool->free_unreserved_2M = 0;
+ INIT_LIST_HEAD(&hpool->dhugetlb_2M_freelists);
+}
+
+static void free_dhugetlb_huge_page(struct dhugetlb_pool *hpool)
+{
+ struct page *page, *next;
+ unsigned long nr_pages = 1 << (PUD_SHIFT - PAGE_SHIFT);
+ unsigned long block_size = 1 << (PMD_SHIFT - PAGE_SHIFT);
+ int i;
+
+ lockdep_assert_held(&hpool->lock);
+ if (list_empty(&hpool->dhugetlb_2M_freelists))
+ return;
+
+ list_for_each_entry_safe(page, next,
+ &hpool->dhugetlb_2M_freelists, lru) {
+ set_compound_page_dtor(page, NULL_COMPOUND_DTOR);
+ atomic_set(compound_mapcount_ptr(page), 0);
+ for (i = 1; i < block_size; i++)
+ clear_compound_head(&page[i]);
+ set_compound_order(page, 0);
+ __ClearPageHead(page);
+ if (page_to_pfn(page) % nr_pages != 0)
+ list_del(&page->lru);
+ }
+ __free_dhugetlb_huge_page(hpool);
+}
+
+static int try_migrate_page(struct page *page, unsigned long nid)
+{
+ unsigned long pfn = page_to_pfn(page);
+ int ret = 0;
+
+ LIST_HEAD(source);
+
+ if (!pfn_valid(pfn))
+ return 0;
+ BUG_ON(PageHuge(page) || PageTransHuge(page));
+ /*
+ * HWPoison pages have elevated reference counts so the migration
+ * would fail on them. It also doesn't make any sense to migrate them
+ * in the first place. Still try to unmap such a page in case it is
+ * still mapped(e.g. current hwpoison implementation doesn't unmap
+ * KSM pages but keep the unmap as the catch all safety net).
+ */
+ if (PageHWPoison(page)) {
+ if (WARN_ON(PageLRU(page)))
+ isolate_lru_page(page);
+ if (page_mapped(page))
+ try_to_unmap(page,
+ TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS);
+ return 0;
+ }
+
+ if (!get_page_unless_zero(page))
+ return 0;
+ /*
+ * We can skip free pages. And we can deal with pages on
+ * LRU and non-lru movable pages.
+ */
+ if (PageLRU(page))
+ ret = isolate_lru_page(page);
+ else
+ ret = isolate_movable_page(page, ISOLATE_UNEVICTABLE);
+ put_page(page);
+ if (ret) {
+ if (page_count(page))
+ ret = -EBUSY;
+ return ret;
+ }
+ list_add_tail(&page->lru, &source);
+ if (!__PageMovable(page))
+ inc_node_page_state(page,
+ NR_ISOLATED_ANON + page_is_file_cache(page));
+
+ ret = migrate_pages(&source, alloc_new_node_page, NULL, nid,
+ MIGRATE_SYNC_LIGHT, MR_COMPACTION);
+ if (ret)
+ putback_movable_pages(&source);
+ return ret;
+}
+
+static void try_migrate_pages(struct dhugetlb_pool *hpool)
+{
+ int i, j;
+ unsigned long nr_free_pages;
+ struct split_pages *split_giga, *next;
+ unsigned int nr_pages = 1 << (PMD_SHIFT - PAGE_SHIFT);
+ struct page *page;
+ int sleep_interval = 100; /* wait for the migration */
+
+ spin_unlock(&hpool->lock);
+ for (i = NR_SMPOOL - 1; i >= 0; i--)
+ spin_unlock(&hpool->smpool[i].lock);
+
+ msleep(sleep_interval);
+ dhugetlb_pool_force_empty(hpool->attach_memcg);
+
+ spin_lock(&hpool->lock);
+ nr_free_pages = hpool->free_pages;
+ spin_unlock(&hpool->lock);
+ for (i = 0; i < NR_SMPOOL; i++) {
+ spin_lock(&hpool->smpool[i].lock);
+ nr_free_pages += hpool->smpool[i].free_pages;
+ spin_unlock(&hpool->smpool[i].lock);
+ }
+
+ if (nr_free_pages >> HUGETLB_PAGE_ORDER < hpool->nr_split_2M) {
+ list_for_each_entry_safe(split_giga, next,
+ &hpool->split_1G_freelists, list) {
+ for (i = 0; i < nr_pages; i++) {
+ if (PageCompound(pfn_to_page(
+ split_giga->start_pfn + i * nr_pages)))
+ continue;
+ page = pfn_to_page(split_giga->start_pfn +
+ i * nr_pages);
+ for (j = 0; j < nr_pages; j++) {
+ if (PagePool(page + j))
+ try_migrate_page(page + j,
+ hpool->nid);
+ }
+ }
+ }
+ }
+
+ for (i = 0; i < NR_SMPOOL; i++)
+ spin_lock(&hpool->smpool[i].lock);
+ spin_lock(&hpool->lock);
+}
+
+/*
+ * If some pages are still in use, we will try to reclaim/migrate them.
+ * After trying at most HPOOL_RECLAIM_RETRIES times, we may succeed.
+ * Otherwise we print the failure information and return false.
+ */
+static bool free_dhugetlb_pages(struct dhugetlb_pool *hpool)
+{
+ int i;
+ long used_pages;
+ int try_count = 0;
+
+retry:
+ used_pages = 0;
+ for (i = 0; i < NR_SMPOOL; i++)
+ used_pages += hpool->smpool[i].used_pages;
+
+ if (try_count < HPOOL_RECLAIM_RETRIES &&
+ (used_pages || hpool->used_2M || hpool->used_1G)) {
+ try_migrate_pages(hpool);
+ try_count++;
+ goto retry;
+ }
+
+ if (used_pages)
+ pr_err("dhugetlb: some 4K pages not free, memcg: %s delete failed!\n",
+ hpool->attach_memcg->css.cgroup->kn->name);
+ else if (hpool->used_2M)
+ pr_err("dhugetlb: some 2M pages not free, memcg: %s delete failed!\n",
+ hpool->attach_memcg->css.cgroup->kn->name);
+ else if (hpool->used_1G)
+ pr_err("dhugetlb: some 1G pages not free, memcg: %s delete failed!\n",
+ hpool->attach_memcg->css.cgroup->kn->name);
+ else {
+ free_dhugetlb_pcpool(hpool);
+ free_dhugetlb_small_page(hpool);
+ free_dhugetlb_huge_page(hpool);
+ return true;
+ }
+ return false;
+}
+
+static void free_back_hugetlb(struct dhugetlb_pool *hpool)
+{
+ int nid;
+ unsigned int nr_pages;
+ unsigned long pfn, idx;
+ struct page *page, *page_next, *p;
+ struct hstate *h = size_to_hstate(PUD_SIZE);
+
+ if (!h)
+ return;
+
+ spin_lock(&hugetlb_lock);
+ list_for_each_entry_safe(page, page_next,
+ &hpool->dhugetlb_1G_freelists, lru) {
+ nr_pages = 1 << huge_page_order(h);
+ pfn = page_to_pfn(page);
+ for (; nr_pages--; pfn++) {
+ p = pfn_to_page(pfn);
+ p->mapping = NULL;
+ }
+ SetPageHugeFreed(page);
+ set_compound_page_dtor(page, HUGETLB_PAGE_DTOR);
+ nid = page_to_nid(page);
+ BUG_ON(nid >= MAX_NUMNODES);
+ list_move(&page->lru, &h->hugepage_freelists[nid]);
+ h->free_huge_pages_node[nid]++;
+ read_lock(&dhugetlb_pagelist_rwlock);
+ idx = page_to_pfn(page) >> (PUD_SHIFT - PAGE_SHIFT);
+ if (idx < dhugetlb_pagelist_t->count)
+ dhugetlb_pagelist_t->hpool[idx] = NULL;
+ read_unlock(&dhugetlb_pagelist_rwlock);
+ }
+ h->free_huge_pages += hpool->total_nr_pages;
+ hpool->total_nr_pages = 0;
+ hpool->free_unreserved_1G = 0;
+ hpool->free_reserved_1G = 0;
+ hpool->total_reserved_1G = 0;
+ INIT_LIST_HEAD(&hpool->dhugetlb_1G_freelists);
+ spin_unlock(&hugetlb_lock);
+}
+
+bool free_dhugetlb_pool(struct dhugetlb_pool *hpool)
+{
+ int i;
+ bool ret = false;
+
+ for (i = 0; i < NR_SMPOOL; i++)
+ spin_lock(&hpool->smpool[i].lock);
+ spin_lock(&hpool->lock);
+
+ ret = free_dhugetlb_pages(hpool);
+ if (!ret)
+ goto out_unlock;
+
+ free_back_hugetlb(hpool);
+
+out_unlock:
+ spin_unlock(&hpool->lock);
+ for (i = NR_SMPOOL - 1; i >= 0; i--)
+ spin_unlock(&hpool->smpool[i].lock);
+
+ if (ret)
+ dhugetlb_pool_put(hpool);
+ return ret;
+}
+
+static void __split_free_huge_page(struct dhugetlb_pool *hpool,
+ struct page *page)
+{
+ int i;
+ int order_h = PUD_SHIFT - PAGE_SHIFT;
+ int order_m = PMD_SHIFT - PAGE_SHIFT;
+ int blocks = 1 << (order_h - order_m);
+ struct page *p = page + 1;
+
+ lockdep_assert_held(&hpool->lock);
+ set_compound_page_dtor(page, NULL_COMPOUND_DTOR);
+ atomic_set(compound_mapcount_ptr(page), 0);
+ for (i = 1; i < (1 << order_h); i++, p = mem_map_next(p, page, i))
+ clear_compound_head(p);
+
+ set_compound_order(page, 0);
+ __ClearPageHead(page);
+
+ /* make it be 2M huge pages and put it to huge pool */
+ for (i = 0; i < blocks; i++, page += (1 << order_m))
+ add_new_huge_page_to_pool(hpool, page, false);
+}
+
+static void __split_free_small_page(struct dhugetlb_pool *hpool,
+ struct page *page)
+{
+ int i;
+ int nr_pages = 1 << (PMD_SHIFT - PAGE_SHIFT);
+
+ lockdep_assert_held(&hpool->lock);
+ set_compound_page_dtor(page, NULL_COMPOUND_DTOR);
+ set_compound_order(page, 0);
+ for (i = 0; i < nr_pages; i++) {
+ if (i != 0) {
+ page[i].mapping = NULL;
+ clear_compound_head(&page[i]);
+ } else
+ __ClearPageHead(page);
+
+ /*
+ * If a hugepage is mapped in private mode, the PG_uptodate bit
+ * will not be cleared when the hugepage is freed. Clear the
+ * hugepage using free_pages_prepare() here.
+ */
+ free_pages_prepare(&page[i], 0, false);
+ hpool->free_pages++;
+ list_add_tail(&page[i].lru, &hpool->dhugetlb_4K_freelists);
+ }
+}
+
+static bool split_free_huge_page(struct dhugetlb_pool *hpool)
+{
+ struct page *page;
+ struct split_pages *split_page;
+
+ lockdep_assert_held(&hpool->lock);
+
+ if (!hpool->free_unreserved_1G)
+ return false;
+
+ split_page = kzalloc(sizeof(struct split_pages), GFP_ATOMIC);
+ if (!split_page)
+ return false;
+
+ page = list_entry(hpool->dhugetlb_1G_freelists.next, struct page, lru);
+ list_del(&page->lru);
+ hpool->free_unreserved_1G--;
+
+ split_page->start_pfn = page_to_pfn(page);
+ list_add(&split_page->list, &hpool->split_1G_freelists);
+ hpool->nr_split_1G++;
+
+ trace_dhugetlb_split_merge(hpool, page, DHUGETLB_SPLIT_1G);
+
+ __split_free_huge_page(hpool, page);
+ return true;
+}
+
+static bool split_free_small_page(struct dhugetlb_pool *hpool)
+{
+ struct page *page;
+ struct split_pages *split_page;
+
+ lockdep_assert_held(&hpool->lock);
+
+ if (!hpool->free_unreserved_2M && !split_free_huge_page(hpool))
+ return false;
+
+ split_page = kzalloc(sizeof(struct split_pages), GFP_ATOMIC);
+ if (!split_page)
+ return false;
+
+ page = list_entry(hpool->dhugetlb_2M_freelists.next, struct page, lru);
+ list_del(&page->lru);
+ hpool->free_unreserved_2M--;
+
+ split_page->start_pfn = page_to_pfn(page);
+ list_add(&split_page->list, &hpool->split_2M_freelists);
+ hpool->nr_split_2M++;
+
+ trace_dhugetlb_split_merge(hpool, page, DHUGETLB_SPLIT_2M);
+
+ __split_free_small_page(hpool, page);
+ return true;
+}
+
+bool move_pages_from_hpool_to_smpool(struct dhugetlb_pool *hpool,
+ struct small_page_pool *smpool)
+{
+ int i = 0;
+ struct page *page, *next;
+
+ if (!hpool->free_pages && !split_free_small_page(hpool))
+ return false;
+
+ list_for_each_entry_safe(page, next,
+ &hpool->dhugetlb_4K_freelists, lru) {
+ list_del(&page->lru);
+ hpool->free_pages--;
+ list_add_tail(&page->lru, &smpool->head_page);
+ smpool->free_pages++;
+ if (++i == BATCH_SMPOOL_PAGE)
+ break;
+ }
+ return true;
+}
+
+void move_pages_from_smpool_to_hpool(struct dhugetlb_pool *hpool,
+ struct small_page_pool *smpool)
+{
+ int i = 0;
+ struct page *page, *next;
+
+ list_for_each_entry_safe(page, next, &smpool->head_page, lru) {
+ list_del(&page->lru);
+ smpool->free_pages--;
+ list_add(&page->lru, &hpool->dhugetlb_4K_freelists);
+ hpool->free_pages++;
+ if (++i == BATCH_SMPOOL_PAGE)
+ break;
+ }
+}
+
+static unsigned long list_len(struct list_head *head)
+{
+ unsigned long len = 0;
+ struct page *page;
+
+ list_for_each_entry(page, head, lru)
+ len++;
+
+ return len;
+}
+
+static void hugetlb_migrate_pages(struct dhugetlb_pool *hpool,
+ unsigned long count)
+{
+ int i, try;
+ struct page *page;
+ struct split_pages *split_huge, *split_next;
+ unsigned long nr_pages = 1 << (PMD_SHIFT - PAGE_SHIFT);
+ LIST_HEAD(wait_page_list);
+
+ list_for_each_entry_safe(split_huge, split_next,
+ &hpool->split_2M_freelists, list) {
+ /*
+ * Isolate free pages first because we don't want them to be
+ * allocated.
+ */
+ for (i = 0; i < nr_pages; i++) {
+ page = pfn_to_page(split_huge->start_pfn + i);
+ if (!PagePool(page))
+ list_move(&page->lru, &wait_page_list);
+ }
+
+ for (try = 0; try < HPOOL_RECLAIM_RETRIES; try++) {
+ /*
+ * Unlock and try migration; after migration we need
+ * to re-acquire the locks.
+ */
+ for (i = 0; i < NR_SMPOOL; i++)
+ hpool->smpool[i].free_pages =
+ list_len(&hpool->smpool[i].head_page);
+ hpool->free_pages =
+ list_len(&hpool->dhugetlb_4K_freelists);
+ spin_unlock(&hpool->lock);
+ for (i = NR_SMPOOL - 1; i >= 0; i--)
+ spin_unlock(&hpool->smpool[i].lock);
+
+ for (i = 0; i < nr_pages; i++) {
+ page = pfn_to_page(split_huge->start_pfn + i);
+ if (PagePool(page))
+ try_migrate_page(page, hpool->nid);
+ }
+ for (i = 0; i < NR_SMPOOL; i++)
+ spin_lock(&hpool->smpool[i].lock);
+ spin_lock(&hpool->lock);
+
+ /*
+ * Isolate free pages. If all pages in the split_huge
+ * are free, return it.
+ */
+ split_huge->free_pages = 0;
+ for (i = 0; i < nr_pages; i++) {
+ page = pfn_to_page(split_huge->start_pfn + i);
+ if (!PagePool(page)) {
+ list_move(&page->lru, &wait_page_list);
+ split_huge->free_pages++;
+ }
+ }
+ if (split_huge->free_pages == nr_pages)
+ break;
+ }
+ if (split_huge->free_pages == nr_pages) {
+ for (i = 0; i < nr_pages; i++) {
+ page = pfn_to_page(split_huge->start_pfn + i);
+ list_del(&page->lru);
+ }
+ INIT_LIST_HEAD(&wait_page_list);
+ page = pfn_to_page(split_huge->start_pfn);
+ add_new_huge_page_to_pool(hpool, page, false);
+ list_del(&split_huge->list);
+ kfree(split_huge);
+ hpool->nr_split_2M--;
+
+ trace_dhugetlb_split_merge(hpool, page,
+ DHUGETLB_MIGRATE_4K);
+
+ if (--count == 0)
+ return;
+ } else {
+ /* Failed, put back the isolate pages */
+ list_splice(&wait_page_list,
+ &hpool->dhugetlb_4K_freelists);
+ INIT_LIST_HEAD(&wait_page_list);
+ }
+ }
+}
+
+static unsigned long merge_free_split_huge(struct dhugetlb_pool *hpool,
+ unsigned long count)
+{
+ int i;
+ struct page *page;
+ struct split_pages *split_huge, *split_next;
+ unsigned long nr_pages = 1 << (PMD_SHIFT - PAGE_SHIFT);
+
+ list_for_each_entry_safe(split_huge, split_next,
+ &hpool->split_2M_freelists, list) {
+ split_huge->free_pages = 0;
+ for (i = 0; i < nr_pages; i++) {
+ page = pfn_to_page(split_huge->start_pfn + i);
+ if (!PagePool(page))
+ split_huge->free_pages++;
+ }
+ if (split_huge->free_pages == nr_pages) {
+ for (i = 0; i < nr_pages; i++) {
+ page = pfn_to_page(split_huge->start_pfn + i);
+ list_del(&page->lru);
+ }
+ page = pfn_to_page(split_huge->start_pfn);
+ add_new_huge_page_to_pool(hpool, page, false);
+ list_del(&split_huge->list);
+ kfree(split_huge);
+ hpool->nr_split_2M--;
+
+ trace_dhugetlb_split_merge(hpool, page,
+ DHUGETLB_MERGE_4K);
+
+ if (--count == 0)
+ return 0;
+ }
+ }
+ return count;
+}
+
+static void merge_free_small_page(struct dhugetlb_pool *hpool,
+ unsigned long count)
+{
+ int i;
+ unsigned long need_migrate;
+
+ if (!hpool->nr_split_2M)
+ return;
+
+ need_migrate = merge_free_split_huge(hpool, count);
+ if (need_migrate)
+ hugetlb_migrate_pages(hpool, need_migrate);
+
+ for (i = 0; i < NR_SMPOOL; i++)
+ hpool->smpool[i].free_pages =
+ list_len(&hpool->smpool[i].head_page);
+ hpool->free_pages = list_len(&hpool->dhugetlb_4K_freelists);
+}
+
+static void dhugetlb_collect_2M_pages(struct dhugetlb_pool *hpool,
+ unsigned long count)
+{
+ int i;
+
+ while (hpool->free_unreserved_1G &&
+ count > hpool->free_unreserved_2M)
+ split_free_huge_page(hpool);
+
+ /*
+ * If we try to merge 4K pages to 2M, we need to unlock hpool->lock
+ * first, and then take every lock in the proper order to avoid deadlock.
+ */
+ if (count > hpool->free_unreserved_2M) {
+ spin_unlock(&hpool->lock);
+ for (i = 0; i < NR_SMPOOL; i++)
+ spin_lock(&hpool->smpool[i].lock);
+ spin_lock(&hpool->lock);
+ merge_free_small_page(hpool, count - hpool->free_unreserved_2M);
+ for (i = NR_SMPOOL - 1; i >= 0; i--)
+ spin_unlock(&hpool->smpool[i].lock);
+ }
+}
+
+/*
+ * Parameter gigantic: true means reserve 1G pages and false means reserve
+ * 2M pages. When we want to reserve more 2M pages than
+ * hpool->free_unreserved_2M, we have to try split/merge. Still, we can't
+ * guarantee success.
+ */
+void dhugetlb_reserve_hugepages(struct dhugetlb_pool *hpool,
+ unsigned long count, bool gigantic)
+{
+ unsigned long delta;
+
+ spin_lock(&hpool->lock);
+ if (gigantic) {
+ if (count > hpool->total_reserved_1G) {
+ delta = min(count - hpool->total_reserved_1G,
+ hpool->free_unreserved_1G);
+ hpool->total_reserved_1G += delta;
+ hpool->free_reserved_1G += delta;
+ hpool->free_unreserved_1G -= delta;
+ } else {
+ delta = min(hpool->total_reserved_1G - count,
+ hpool->free_reserved_1G -
+ hpool->mmap_reserved_1G);
+ hpool->total_reserved_1G -= delta;
+ hpool->free_reserved_1G -= delta;
+ hpool->free_unreserved_1G += delta;
+ }
+ } else {
+ if (count > hpool->total_reserved_2M) {
+ delta = count - hpool->total_reserved_2M;
+ if (delta > hpool->free_unreserved_2M)
+ dhugetlb_collect_2M_pages(hpool, delta);
+ delta = min(count - hpool->total_reserved_2M,
+ hpool->free_unreserved_2M);
+ hpool->total_reserved_2M += delta;
+ hpool->free_reserved_2M += delta;
+ hpool->free_unreserved_2M -= delta;
+ } else {
+ delta = min(hpool->total_reserved_2M - count,
+ hpool->free_reserved_2M -
+ hpool->mmap_reserved_2M);
+ hpool->total_reserved_2M -= delta;
+ hpool->free_reserved_2M -= delta;
+ hpool->free_unreserved_2M += delta;
+ }
+ }
+ spin_unlock(&hpool->lock);
+}
+
+static int dhugetlb_acct_memory(struct hstate *h, long delta,
+ struct dhugetlb_pool *hpool)
+{
+ int ret = -ENOMEM;
+
+ if (delta == 0)
+ return 0;
+
+ spin_lock(&hpool->lock);
+ if (hstate_is_gigantic(h)) {
+ if (delta > 0 && delta <= hpool->free_reserved_1G -
+ hpool->mmap_reserved_1G) {
+ hpool->mmap_reserved_1G += delta;
+ ret = 0;
+ trace_dhugetlb_acct_memory(hpool,
+ hpool->mmap_reserved_1G,
+ DHUGETLB_RESV_1G);
+ } else if (delta < 0) {
+ hpool->mmap_reserved_1G -= (unsigned long)(-delta);
+ WARN_ON(hpool->mmap_reserved_1G < 0);
+ ret = 0;
+ trace_dhugetlb_acct_memory(hpool,
+ hpool->mmap_reserved_1G,
+ DHUGETLB_UNRESV_1G);
+ }
+ } else {
+ if (delta > 0 && delta <= hpool->free_reserved_2M -
+ hpool->mmap_reserved_2M) {
+ hpool->mmap_reserved_2M += delta;
+ ret = 0;
+ trace_dhugetlb_acct_memory(hpool,
+ hpool->mmap_reserved_2M,
+ DHUGETLB_RESV_2M);
+ } else if (delta < 0) {
+ hpool->mmap_reserved_2M -= (unsigned long)(-delta);
+ WARN_ON(hpool->mmap_reserved_2M < 0);
+ ret = 0;
+ trace_dhugetlb_acct_memory(hpool,
+ hpool->mmap_reserved_2M,
+ DHUGETLB_UNRESV_2M);
+ }
+ }
+ spin_unlock(&hpool->lock);
+
+ return ret;
+}
+#else
+static int dhugetlb_acct_memory(struct hstate *h, long delta,
+ struct dhugetlb_pool *hpool)
+{
+ return 0;
+}
+#endif /* CONFIG_DYNAMIC_HUGETLB */
+
static int __init hugetlb_init(void)
{
int i;
@@ -3134,6 +4240,23 @@ static int __init hugetlb_init(void)
hugetlb_register_all_nodes();
hugetlb_cgroup_file_init();
+#ifdef CONFIG_DYNAMIC_HUGETLB
+ if (enable_dhugetlb) {
+ unsigned long count = max(max_pfn >> (PUD_SHIFT - PAGE_SHIFT),
+ (unsigned long)DEFAULT_PAGESIZE);
+ unsigned long size = sizeof(struct dhugetlb_pagelist) +
+ count * sizeof(struct dhugetlb_pool *);
+ dhugetlb_pagelist_t = kzalloc(size, GFP_KERNEL);
+ if (dhugetlb_pagelist_t) {
+ dhugetlb_pagelist_t->count = count;
+ static_branch_enable(&dhugetlb_enabled_key);
+ pr_info("Dynamic 1G hugepage enabled\n");
+ } else
+ pr_info("Dynamic 1G hugepage disabled due to out of memory, need %lu\n",
+ size);
+ }
+#endif
+
#ifdef CONFIG_SMP
num_fault_mutexes = roundup_pow_of_two(8 * num_possible_cpus());
#else
@@ -3270,6 +4393,16 @@ static int __init hugetlb_nrpages_setup(char *s)
}
__setup("hugepages=", hugetlb_nrpages_setup);
+#ifdef CONFIG_DYNAMIC_HUGETLB
+static int __init dhugetlb_setup(char *s)
+{
+ if (!strcmp(s, "on"))
+ enable_dhugetlb = true;
+ return 1;
+}
+__setup("dynamic_1G_hugepage=", dhugetlb_setup);
+#endif
+
static int __init hugetlb_default_setup(char *s)
{
default_hstate_size = memparse(s, &s);
@@ -3471,10 +4604,14 @@ unsigned long hugetlb_total_pages(void)
return nr_total_pages;
}
-static int hugetlb_acct_memory(struct hstate *h, long delta)
+static int hugetlb_acct_memory(struct hstate *h, long delta,
+ struct dhugetlb_pool *hpool)
{
int ret = -ENOMEM;
+ if (dhugetlb_enabled && hpool)
+ return dhugetlb_acct_memory(h, delta, hpool);
+
spin_lock(&hugetlb_lock);
/*
* When cpuset is configured, it breaks the strict hugetlb page
@@ -3535,6 +4672,8 @@ static void hugetlb_vm_op_close(struct vm_area_struct *vma)
struct hugepage_subpool *spool = subpool_vma(vma);
unsigned long reserve, start, end;
long gbl_reserve;
+ struct dhugetlb_pool *hpool =
+ HUGETLBFS_I(file_inode(vma->vm_file))->hpool;
if (!resv || !is_vma_resv_set(vma, HPAGE_RESV_OWNER))
return;
@@ -3551,8 +4690,8 @@ static void hugetlb_vm_op_close(struct vm_area_struct *vma)
* Decrement reserve counts. The global reserve count may be
* adjusted if the subpool has a minimum size.
*/
- gbl_reserve = hugepage_subpool_put_pages(spool, reserve);
- hugetlb_acct_memory(h, -gbl_reserve);
+ gbl_reserve = hugepage_subpool_put_pages(spool, reserve, hpool);
+ hugetlb_acct_memory(h, -gbl_reserve, hpool);
}
}
@@ -4934,6 +6073,7 @@ int hugetlb_reserve_pages(struct inode *inode,
struct hugepage_subpool *spool = subpool_inode(inode);
struct resv_map *resv_map;
long gbl_reserve;
+ struct dhugetlb_pool *hpool = HUGETLBFS_I(inode)->hpool;
/* This should never happen */
if (from > to) {
@@ -4986,7 +6126,7 @@ int hugetlb_reserve_pages(struct inode *inode,
* the subpool has a minimum size, there may be some global
* reservations already in place (gbl_reserve).
*/
- gbl_reserve = hugepage_subpool_get_pages(spool, chg);
+ gbl_reserve = hugepage_subpool_get_pages(spool, chg, hpool);
if (gbl_reserve < 0) {
ret = -ENOSPC;
goto out_err;
@@ -4996,10 +6136,10 @@ int hugetlb_reserve_pages(struct inode *inode,
* Check enough hugepages are available for the reservation.
* Hand the pages back to the subpool if there are not
*/
- ret = hugetlb_acct_memory(h, gbl_reserve);
+ ret = hugetlb_acct_memory(h, gbl_reserve, hpool);
if (ret < 0) {
/* put back original number of pages, chg */
- (void)hugepage_subpool_put_pages(spool, chg);
+ (void)hugepage_subpool_put_pages(spool, chg, hpool);
goto out_err;
}
@@ -5028,8 +6168,9 @@ int hugetlb_reserve_pages(struct inode *inode,
long rsv_adjust;
rsv_adjust = hugepage_subpool_put_pages(spool,
- chg - add);
- hugetlb_acct_memory(h, -rsv_adjust);
+ chg - add,
+ hpool);
+ hugetlb_acct_memory(h, -rsv_adjust, hpool);
}
}
return 0;
@@ -5051,6 +6192,7 @@ long hugetlb_unreserve_pages(struct inode *inode, long start, long end,
long chg = 0;
struct hugepage_subpool *spool = subpool_inode(inode);
long gbl_reserve;
+ struct dhugetlb_pool *hpool = HUGETLBFS_I(inode)->hpool;
/*
* Since this routine can be called in the evict inode path for all
@@ -5075,8 +6217,8 @@ long hugetlb_unreserve_pages(struct inode *inode, long start, long end,
* If the subpool has a minimum size, the number of global
* reservations to be released may be adjusted.
*/
- gbl_reserve = hugepage_subpool_put_pages(spool, (chg - freed));
- hugetlb_acct_memory(h, -gbl_reserve);
+ gbl_reserve = hugepage_subpool_put_pages(spool, (chg - freed), hpool);
+ hugetlb_acct_memory(h, -gbl_reserve, hpool);
return 0;
}
diff --git a/mm/internal.h b/mm/internal.h
index 1b861446c751..deffd247b010 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -182,6 +182,7 @@ extern void __putback_isolated_page(struct page *page, unsigned int order,
int mt);
extern void __free_pages_core(struct page *page, unsigned int order);
extern void prep_compound_page(struct page *page, unsigned int order);
+extern int check_new_page(struct page *page);
extern void post_alloc_hook(struct page *page, unsigned int order,
gfp_t gfp_flags);
extern int user_min_free_kbytes;
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 63b91a030b02..bdc90e6fc082 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -997,6 +997,41 @@ static __always_inline struct mem_cgroup *get_mem_cgroup_from_current(void)
return get_mem_cgroup_from_mm(current->mm);
}
+#ifdef CONFIG_DYNAMIC_HUGETLB
+void free_page_to_dhugetlb_pool(struct page *page)
+{
+ struct dhugetlb_pool *hpool;
+ struct small_page_pool *smpool;
+ unsigned long flags;
+
+ hpool = get_dhugetlb_pool_from_dhugetlb_pagelist(page);
+ if (unlikely(!hpool)) {
+ pr_err("dhugetlb: free error: get hpool failed\n");
+ return;
+ }
+
+ smpool = &hpool->smpool[smp_processor_id()];
+ spin_lock_irqsave(&smpool->lock, flags);
+
+ ClearPagePool(page);
+ if (!free_pages_prepare(page, 0, false)) {
+ SetPagePool(page);
+ goto out;
+ }
+ list_add(&page->lru, &smpool->head_page);
+ smpool->free_pages++;
+ smpool->used_pages--;
+ if (smpool->free_pages > MAX_SMPOOL_PAGE) {
+ spin_lock(&hpool->lock);
+ move_pages_from_smpool_to_hpool(hpool, smpool);
+ spin_unlock(&hpool->lock);
+ }
+out:
+ spin_unlock_irqrestore(&smpool->lock, flags);
+ dhugetlb_pool_put(hpool);
+}
+#endif /* CONFIG_DYNAMIC_HUGETLB */
+
/**
* mem_cgroup_iter - iterate over memory cgroup hierarchy
* @root: hierarchy root
@@ -3118,6 +3153,31 @@ static int mem_cgroup_force_empty(struct mem_cgroup *memcg)
return 0;
}
+#ifdef CONFIG_DYNAMIC_HUGETLB
+int dhugetlb_pool_force_empty(struct mem_cgroup *memcg)
+{
+ lru_add_drain_all();
+
+ drain_all_stock(memcg);
+
+ while (page_counter_read(&memcg->memory)) {
+ int progress;
+
+ if (signal_pending(current))
+ return -EINTR;
+
+ progress = try_to_free_mem_cgroup_pages(memcg, 1,
+ GFP_HIGHUSER_MOVABLE,
+ false);
+
+ if (!progress) {
+ congestion_wait(BLK_RW_ASYNC, HZ/10);
+ break;
+ }
+ }
+ return 0;
+}
+#endif
static ssize_t mem_cgroup_force_empty_write(struct kernfs_open_file *of,
char *buf, size_t nbytes,
@@ -4652,6 +4712,305 @@ static ssize_t memcg_write_event_control(struct kernfs_open_file *of,
return ret;
}
+#ifdef CONFIG_DYNAMIC_HUGETLB
+struct dhugetlb_pool *get_dhugetlb_pool_from_memcg(struct mem_cgroup *memcg)
+{
+ struct mem_cgroup_extension *memcg_ext;
+
+ if (!memcg)
+ return NULL;
+
+ memcg_ext = container_of(memcg, struct mem_cgroup_extension, memcg);
+ if (dhugetlb_pool_get(memcg_ext->hpool))
+ return memcg_ext->hpool;
+ return NULL;
+}
+
+static void set_dhugetlb_pool_to_memcg(struct mem_cgroup *memcg,
+ struct dhugetlb_pool *hpool)
+{
+ struct mem_cgroup_extension *memcg_ext;
+
+ memcg_ext = container_of(memcg, struct mem_cgroup_extension, memcg);
+
+ memcg_ext->hpool = hpool;
+}
+
+static bool should_allocate_from_dhugetlb_pool(gfp_t gfp_mask)
+{
+ gfp_t gfp = gfp_mask & GFP_HIGHUSER_MOVABLE;
+
+ if (current->flags & PF_KTHREAD)
+ return false;
+
+ /*
+ * The cgroup only charges anonymous and file pages from userspace.
+ * Some filesystems may have masked out __GFP_IO | __GFP_FS
+ * to avoid recursive memory requests, e.g. loop device, xfs.
+ */
+ if ((gfp | __GFP_IO | __GFP_FS) != GFP_HIGHUSER_MOVABLE)
+ return false;
+
+ return true;
+}
+
+static struct page *__alloc_page_from_dhugetlb_pool(void)
+{
+ bool ret;
+ struct dhugetlb_pool *hpool;
+ struct small_page_pool *smpool;
+ struct page *page = NULL;
+ unsigned long flags;
+
+ hpool = get_dhugetlb_pool_from_task(current);
+ if (unlikely(!hpool))
+ goto out;
+
+ smpool = &hpool->smpool[smp_processor_id()];
+ spin_lock_irqsave(&smpool->lock, flags);
+
+ if (smpool->free_pages == 0) {
+ spin_lock(&hpool->lock);
+ ret = move_pages_from_hpool_to_smpool(hpool, smpool);
+ spin_unlock(&hpool->lock);
+ if (!ret)
+ goto unlock;
+ }
+
+ page = list_entry(smpool->head_page.next, struct page, lru);
+ list_del(&page->lru);
+ smpool->free_pages--;
+ smpool->used_pages++;
+ check_new_page(page);
+ SetPagePool(page);
+unlock:
+ spin_unlock_irqrestore(&smpool->lock, flags);
+out:
+ dhugetlb_pool_put(hpool);
+ return page;
+}
+
+struct page *alloc_page_from_dhugetlb_pool(gfp_t gfp_mask)
+{
+ struct page *page = NULL;
+
+ if (should_allocate_from_dhugetlb_pool(gfp_mask))
+ page = __alloc_page_from_dhugetlb_pool();
+
+ return page;
+}
+
+static void assign_new_dhugetlb_pool(struct mem_cgroup *memcg,
+ unsigned long nid)
+{
+ struct dhugetlb_pool *hpool;
+
+ hpool = hpool_alloc(nid);
+ if (!hpool)
+ return;
+
+ hpool->attach_memcg = memcg;
+ css_get(&memcg->css);
+ set_dhugetlb_pool_to_memcg(memcg, hpool);
+}
+
+static int update_dhugetlb_pool(struct mem_cgroup *memcg,
+ unsigned long nid, unsigned long size)
+{
+ int ret;
+ struct dhugetlb_pool *hpool = get_dhugetlb_pool_from_memcg(memcg);
+
+ if (!hpool) {
+ if (memcg_has_children(memcg))
+ return -EINVAL;
+ assign_new_dhugetlb_pool(memcg, nid);
+ hpool = get_dhugetlb_pool_from_memcg(memcg);
+ }
+ if (!hpool)
+ return -ENOMEM;
+ if (hpool->attach_memcg != memcg || hpool->nid != nid) {
+ dhugetlb_pool_put(hpool);
+ return -EINVAL;
+ }
+
+ ret = alloc_hugepage_from_hugetlb(hpool, nid, size);
+
+ dhugetlb_pool_put(hpool);
+ return ret;
+}
+
+/*
+ * Allocate the specified number of 1G huge pages from the hugetlb pool
+ * on the given node to this memcg's dhugetlb pool.
+ *
+ * Input must be in format '<nid> <size>'.
+ * size is interpreted as the number of 1G huge pages.
+ */
+static ssize_t memcg_write_dhugetlb(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off)
+{
+ int ret;
+ unsigned long nid, size;
+ char *endp;
+ struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
+
+ if (!dhugetlb_enabled)
+ return -EINVAL;
+
+ buf = strstrip(buf);
+ nid = memparse(buf, &endp);
+ if (*endp != ' ' || nid >= MAX_NUMNODES)
+ return -EINVAL;
+
+ buf = endp + 1;
+ size = memparse(buf, &endp);
+ if (*endp != '\0' || size == 0)
+ return -EINVAL;
+
+ ret = update_dhugetlb_pool(memcg, nid, size);
+
+ return ret ?: nbytes;
+}
+
+static int memcg_read_dhugetlb(struct seq_file *m, void *v)
+{
+ int i;
+ unsigned long free_pages;
+ long used_pages = 0;
+ struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
+ struct dhugetlb_pool *hpool = get_dhugetlb_pool_from_memcg(memcg);
+
+ if (!dhugetlb_enabled)
+ return 0;
+ if (!hpool) {
+ seq_printf(m, "Curent hierarchial have not memory pool.\n");
+ return 0;
+ }
+
+ for (i = 0; i < NR_SMPOOL; i++)
+ spin_lock(&hpool->smpool[i].lock);
+ spin_lock(&hpool->lock);
+
+ free_pages = hpool->free_pages;
+ for (i = 0; i < NR_SMPOOL; i++) {
+ free_pages += hpool->smpool[i].free_pages;
+ used_pages += hpool->smpool[i].used_pages;
+ }
+
+ seq_printf(m, "dhugetlb_total_pages %ld\n"
+ "1G_total_reserved_pages %ld\n"
+ "1G_free_reserved_pages %ld\n"
+ "1G_mmap_reserved_pages %ld\n"
+ "1G_used_pages %ld\n"
+ "1G_free_unreserved_pages %ld\n"
+ "2M_total_reserved_pages %ld\n"
+ "2M_free_reserved_pages %ld\n"
+ "2M_mmap_reserved_pages %ld\n"
+ "2M_used_pages %ld\n"
+ "2M_free_unreserved_pages %ld\n"
+ "4K_free_pages %ld\n"
+ "4K_used_pages %ld\n",
+ hpool->total_nr_pages,
+ hpool->total_reserved_1G,
+ hpool->free_reserved_1G,
+ hpool->mmap_reserved_1G,
+ hpool->used_1G,
+ hpool->free_unreserved_1G,
+ hpool->total_reserved_2M,
+ hpool->free_reserved_2M,
+ hpool->mmap_reserved_2M,
+ hpool->used_2M,
+ hpool->free_unreserved_2M,
+ free_pages,
+ used_pages);
+
+ spin_unlock(&hpool->lock);
+ for (i = NR_SMPOOL - 1; i >= 0; i--)
+ spin_unlock(&hpool->smpool[i].lock);
+ dhugetlb_pool_put(hpool);
+ return 0;
+}
+
+static int update_reserve_pages(struct kernfs_open_file *of,
+ char *buf, bool gigantic)
+{
+ unsigned long size;
+ char *endp;
+ struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
+ struct dhugetlb_pool *hpool;
+
+ if (!dhugetlb_enabled)
+ return -EINVAL;
+
+ buf = strstrip(buf);
+ size = memparse(buf, &endp);
+ if (*endp != '\0')
+ return -EINVAL;
+
+ hpool = get_dhugetlb_pool_from_memcg(memcg);
+ if (!hpool)
+ return -EINVAL;
+ spin_lock(&hpool->reserved_lock);
+ dhugetlb_reserve_hugepages(hpool, size, gigantic);
+ spin_unlock(&hpool->reserved_lock);
+ dhugetlb_pool_put(hpool);
+ return 0;
+}
+
+static ssize_t dhugetlb_1G_reserve_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off)
+{
+ return update_reserve_pages(of, buf, true) ?: nbytes;
+}
+
+static ssize_t dhugetlb_2M_reserve_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off)
+{
+ return update_reserve_pages(of, buf, false) ?: nbytes;
+}
+
+static void dhugetlb_pool_inherits(struct mem_cgroup *memcg,
+ struct mem_cgroup *parent)
+{
+ struct dhugetlb_pool *hpool;
+
+ hpool = get_dhugetlb_pool_from_memcg(parent);
+ if (!hpool)
+ return;
+
+ set_dhugetlb_pool_to_memcg(memcg, hpool);
+ dhugetlb_pool_put(hpool);
+}
+
+static bool dhugetlb_pool_free(struct mem_cgroup *memcg)
+{
+ bool ret = true;
+ struct dhugetlb_pool *hpool;
+
+ hpool = get_dhugetlb_pool_from_memcg(memcg);
+ if (hpool && hpool->attach_memcg == memcg)
+ ret = free_dhugetlb_pool(hpool);
+ dhugetlb_pool_put(hpool);
+ return ret;
+}
+
+bool dhugetlb_pool_is_free(struct cgroup_subsys_state *css)
+{
+ if (dhugetlb_enabled)
+ return dhugetlb_pool_free(mem_cgroup_from_css(css));
+ return true;
+}
+#else
+static void dhugetlb_pool_inherits(struct mem_cgroup *memcg,
+ struct mem_cgroup *parent)
+{
+}
+
+bool dhugetlb_pool_is_free(struct cgroup_subsys_state *css)
+{
+ return true;
+}
+#endif /* CONFIG_DYNAMIC_HUGETLB */
+
static struct cftype mem_cgroup_legacy_files[] = {
{
.name = "usage_in_bytes",
@@ -4700,6 +5059,27 @@ static struct cftype mem_cgroup_legacy_files[] = {
.write = memcg_write_event_control,
.flags = CFTYPE_NO_PREFIX | CFTYPE_WORLD_WRITABLE,
},
+#ifdef CONFIG_DYNAMIC_HUGETLB
+ {
+ .name = "dhugetlb.nr_pages",
+ .write = memcg_write_dhugetlb,
+ .seq_show = memcg_read_dhugetlb,
+ .flags = CFTYPE_NO_PREFIX | CFTYPE_WORLD_WRITABLE |
+ CFTYPE_NOT_ON_ROOT,
+ },
+ {
+ .name = "dhugetlb.1G.reserved_pages",
+ .write = dhugetlb_1G_reserve_write,
+ .flags = CFTYPE_NO_PREFIX | CFTYPE_WORLD_WRITABLE |
+ CFTYPE_NOT_ON_ROOT,
+ },
+ {
+ .name = "dhugetlb.2M.reserved_pages",
+ .write = dhugetlb_2M_reserve_write,
+ .flags = CFTYPE_NO_PREFIX | CFTYPE_WORLD_WRITABLE |
+ CFTYPE_NOT_ON_ROOT,
+ },
+#endif
{
.name = "swappiness",
.read_u64 = mem_cgroup_swappiness_read,
@@ -5063,6 +5443,9 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
return &memcg->css;
}
+ if (dhugetlb_enabled)
+ dhugetlb_pool_inherits(memcg, parent);
+
error = memcg_online_kmem(memcg);
if (error)
goto fail;
@@ -5681,6 +6064,14 @@ static int mem_cgroup_can_attach(struct cgroup_taskset *tset)
if (!p)
return 0;
+ if (dhugetlb_enabled) {
+ struct dhugetlb_pool *hpool = get_dhugetlb_pool_from_task(p);
+
+ if (hpool) {
+ dhugetlb_pool_put(hpool);
+ return -EPERM;
+ }
+ }
/*
* We are now commited to this value whatever it is. Changes in this
* tunable will only affect upcoming migrations, not the current one.
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index a6a2f254f61f..e722d73a3724 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1052,7 +1052,7 @@ static int free_tail_pages_check(struct page *head_page, struct page *page)
return ret;
}
-static __always_inline bool free_pages_prepare(struct page *page,
+__always_inline bool free_pages_prepare(struct page *page,
unsigned int order, bool check_free)
{
int bad = 0;
@@ -2012,7 +2012,7 @@ static void check_new_page_bad(struct page *page)
/*
* This page is about to be returned from the page allocator
*/
-static inline int check_new_page(struct page *page)
+inline int check_new_page(struct page *page)
{
if (likely(page_expected_state(page,
PAGE_FLAGS_CHECK_AT_PREP|__PG_HWPOISON)))
@@ -2075,8 +2075,8 @@ inline void post_alloc_hook(struct page *page, unsigned int order,
set_page_owner(page, order, gfp_flags);
}
-static void prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags,
- unsigned int alloc_flags)
+void prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags,
+ unsigned int alloc_flags)
{
int i;
@@ -2955,6 +2955,12 @@ void free_unref_page(struct page *page)
unsigned long flags;
unsigned long pfn = page_to_pfn(page);
+ /* Free dynamic hugetlb page */
+ if (dhugetlb_enabled && PagePool(page)) {
+ free_page_to_dhugetlb_pool(page);
+ return;
+ }
+
if (!free_unref_page_prepare(page, pfn))
return;
@@ -2972,6 +2978,16 @@ void free_unref_page_list(struct list_head *list)
unsigned long flags, pfn;
int batch_count = 0;
+ /* Free dynamic hugetlb pages */
+ if (dhugetlb_enabled) {
+ list_for_each_entry_safe(page, next, list, lru) {
+ if (PagePool(page)) {
+ list_del(&page->lru);
+ free_page_to_dhugetlb_pool(page);
+ }
+ }
+ }
+
/* Prepare pages for freeing */
list_for_each_entry_safe(page, next, list, lru) {
pfn = page_to_pfn(page);
@@ -4785,6 +4801,15 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, int preferred_nid,
finalise_ac(gfp_mask, &ac);
+ /* Dynamic hugetlb allocation attempt */
+ if (dhugetlb_enabled && likely(order == 0)) {
+ page = alloc_page_from_dhugetlb_pool(gfp_mask);
+ if (page) {
+ prep_new_page(page, order, gfp_mask, alloc_flags);
+ goto out;
+ }
+ }
+
/* First allocation attempt */
page = get_page_from_freelist(alloc_mask, order, alloc_flags, &ac);
if (likely(page))
--
2.25.1
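A hedged usage sketch of the interface added by this patch: the cgroup v1 memory mount point, the "demo" cgroup name and the values below are assumptions, and the kernel must have been booted with dynamic_1G_hugepage=on for the dhugetlb files to exist. This is illustrative only, not part of the patch.
/* Illustrative userspace driver for the new memcg files; paths and values
 * are examples only.
 */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
static int write_str(const char *path, const char *val)
{
	int fd = open(path, O_WRONLY);
	ssize_t n;
	if (fd < 0)
		return -1;
	n = write(fd, val, strlen(val));
	close(fd);
	return n < 0 ? -1 : 0;
}
int main(void)
{
	const char *cg = "/sys/fs/cgroup/memory/demo";	/* assumed cgroup path */
	char path[256];
	/* '<nid> <size>': move 4 x 1G huge pages from node 0 into the pool. */
	snprintf(path, sizeof(path), "%s/dhugetlb.nr_pages", cg);
	write_str(path, "0 4");
	/* Reserve 2 pages as 1G and 512 pages as 2M from the pool. */
	snprintf(path, sizeof(path), "%s/dhugetlb.1G.reserved_pages", cg);
	write_str(path, "2");
	snprintf(path, sizeof(path), "%s/dhugetlb.2M.reserved_pages", cg);
	write_str(path, "512");
	return 0;
}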

03 Feb '23
From: Sebastian Andrzej Siewior <bigeasy(a)linutronix.de>
stable inclusion
from stable-v5.10.142
commit d71a1c9fce184718d1b3a51a9e8a6e31cbbb45ce
category: bugfix
bugzilla: 188217
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id…
-------------------------------------------------
commit 278d3ba61563ceed3cb248383ced19e14ec7bc1f upstream.
On 32bit-UP u64_stats_fetch_begin() disables only preemption. If the
reader is in preemptible context and the writer side
(u64_stats_update_begin*()) runs in an interrupt context (IRQ or
softirq) then the writer can update the stats during the read operation.
This update remains undetected.
Use u64_stats_fetch_begin_irq() to ensure the stats fetch on 32bit-UP
is not interrupted by a writer. 32bit-SMP remains unaffected by this
change.
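For illustration, a minimal sketch of the reader pattern this series converts; the demo_stats structure and demo_read_stats() helper are hypothetical and not part of this patch.
/* Minimal sketch of the IRQ-safe u64_stats reader pattern.
 * The structure and field names below are illustrative only.
 */
#include <linux/types.h>
#include <linux/u64_stats_sync.h>
struct demo_stats {
	u64 packets;
	u64 bytes;
	struct u64_stats_sync syncp;
};
/* The writer side (e.g. in IRQ/softirq context) wraps its updates in
 * u64_stats_update_begin(&s->syncp) / u64_stats_update_end(&s->syncp).
 */
static void demo_read_stats(struct demo_stats *s, u64 *packets, u64 *bytes)
{
	unsigned int start;
	do {
		/* On 32bit-UP the _irq variant disables interrupts, so an
		 * interrupting writer cannot update the counters mid-read.
		 */
		start = u64_stats_fetch_begin_irq(&s->syncp);
		*packets = s->packets;
		*bytes = s->bytes;
	} while (u64_stats_fetch_retry_irq(&s->syncp, start));
}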
Cc: "David S. Miller" <davem(a)davemloft.net>
Cc: Catherine Sullivan <csully(a)google.com>
Cc: David Awogbemila <awogbemila(a)google.com>
Cc: Dimitris Michailidis <dmichail(a)fungible.com>
Cc: Eric Dumazet <edumazet(a)google.com>
Cc: Hans Ulli Kroll <ulli.kroll(a)googlemail.com>
Cc: Jakub Kicinski <kuba(a)kernel.org>
Cc: Jeroen de Borst <jeroendb(a)google.com>
Cc: Johannes Berg <johannes(a)sipsolutions.net>
Cc: Linus Walleij <linus.walleij(a)linaro.org>
Cc: Paolo Abeni <pabeni(a)redhat.com>
Cc: Simon Horman <simon.horman(a)corigine.com>
Cc: linux-arm-kernel(a)lists.infradead.org
Cc: linux-wireless(a)vger.kernel.org
Cc: netdev(a)vger.kernel.org
Cc: oss-drivers(a)corigine.com
Cc: stable(a)vger.kernel.org
Signed-off-by: Sebastian Andrzej Siewior <bigeasy(a)linutronix.de>
Reviewed-by: Simon Horman <simon.horman(a)corigine.com>
Signed-off-by: David S. Miller <davem(a)davemloft.net>
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
(cherry picked from commit d71a1c9fce184718d1b3a51a9e8a6e31cbbb45ce)
Signed-off-by: Wang Yufen <wangyufen(a)huawei.com>
Conflicts:
drivers/net/ethernet/huawei/hinic/hinic_rx.c
drivers/net/ethernet/huawei/hinic/hinic_tx.c
Signed-off-by: Wang Yufen <wangyufen(a)huawei.com>
---
drivers/net/ethernet/cortina/gemini.c | 24 +++++++++++-----------
drivers/net/ethernet/google/gve/gve_ethtool.c | 16 +++++++--------
drivers/net/ethernet/google/gve/gve_main.c | 12 +++++------
drivers/net/ethernet/huawei/hinic/hinic_rx.c | 4 ++--
drivers/net/ethernet/huawei/hinic/hinic_tx.c | 4 ++--
.../net/ethernet/netronome/nfp/nfp_net_common.c | 8 ++++----
.../net/ethernet/netronome/nfp/nfp_net_ethtool.c | 8 ++++----
drivers/net/netdevsim/netdev.c | 4 ++--
net/mac80211/sta_info.c | 8 ++++----
net/mpls/af_mpls.c | 4 ++--
10 files changed, 46 insertions(+), 46 deletions(-)
diff --git a/drivers/net/ethernet/cortina/gemini.c b/drivers/net/ethernet/cortina/gemini.c
index 3685878..b22ea40 100644
--- a/drivers/net/ethernet/cortina/gemini.c
+++ b/drivers/net/ethernet/cortina/gemini.c
@@ -1920,7 +1920,7 @@ static void gmac_get_stats64(struct net_device *netdev,
/* Racing with RX NAPI */
do {
- start = u64_stats_fetch_begin(&port->rx_stats_syncp);
+ start = u64_stats_fetch_begin_irq(&port->rx_stats_syncp);
stats->rx_packets = port->stats.rx_packets;
stats->rx_bytes = port->stats.rx_bytes;
@@ -1932,11 +1932,11 @@ static void gmac_get_stats64(struct net_device *netdev,
stats->rx_crc_errors = port->stats.rx_crc_errors;
stats->rx_frame_errors = port->stats.rx_frame_errors;
- } while (u64_stats_fetch_retry(&port->rx_stats_syncp, start));
+ } while (u64_stats_fetch_retry_irq(&port->rx_stats_syncp, start));
/* Racing with MIB and TX completion interrupts */
do {
- start = u64_stats_fetch_begin(&port->ir_stats_syncp);
+ start = u64_stats_fetch_begin_irq(&port->ir_stats_syncp);
stats->tx_errors = port->stats.tx_errors;
stats->tx_packets = port->stats.tx_packets;
@@ -1946,15 +1946,15 @@ static void gmac_get_stats64(struct net_device *netdev,
stats->rx_missed_errors = port->stats.rx_missed_errors;
stats->rx_fifo_errors = port->stats.rx_fifo_errors;
- } while (u64_stats_fetch_retry(&port->ir_stats_syncp, start));
+ } while (u64_stats_fetch_retry_irq(&port->ir_stats_syncp, start));
/* Racing with hard_start_xmit */
do {
- start = u64_stats_fetch_begin(&port->tx_stats_syncp);
+ start = u64_stats_fetch_begin_irq(&port->tx_stats_syncp);
stats->tx_dropped = port->stats.tx_dropped;
- } while (u64_stats_fetch_retry(&port->tx_stats_syncp, start));
+ } while (u64_stats_fetch_retry_irq(&port->tx_stats_syncp, start));
stats->rx_dropped += stats->rx_missed_errors;
}
@@ -2032,18 +2032,18 @@ static void gmac_get_ethtool_stats(struct net_device *netdev,
/* Racing with MIB interrupt */
do {
p = values;
- start = u64_stats_fetch_begin(&port->ir_stats_syncp);
+ start = u64_stats_fetch_begin_irq(&port->ir_stats_syncp);
for (i = 0; i < RX_STATS_NUM; i++)
*p++ = port->hw_stats[i];
- } while (u64_stats_fetch_retry(&port->ir_stats_syncp, start));
+ } while (u64_stats_fetch_retry_irq(&port->ir_stats_syncp, start));
values = p;
/* Racing with RX NAPI */
do {
p = values;
- start = u64_stats_fetch_begin(&port->rx_stats_syncp);
+ start = u64_stats_fetch_begin_irq(&port->rx_stats_syncp);
for (i = 0; i < RX_STATUS_NUM; i++)
*p++ = port->rx_stats[i];
@@ -2051,13 +2051,13 @@ static void gmac_get_ethtool_stats(struct net_device *netdev,
*p++ = port->rx_csum_stats[i];
*p++ = port->rx_napi_exits;
- } while (u64_stats_fetch_retry(&port->rx_stats_syncp, start));
+ } while (u64_stats_fetch_retry_irq(&port->rx_stats_syncp, start));
values = p;
/* Racing with TX start_xmit */
do {
p = values;
- start = u64_stats_fetch_begin(&port->tx_stats_syncp);
+ start = u64_stats_fetch_begin_irq(&port->tx_stats_syncp);
for (i = 0; i < TX_MAX_FRAGS; i++) {
*values++ = port->tx_frag_stats[i];
@@ -2066,7 +2066,7 @@ static void gmac_get_ethtool_stats(struct net_device *netdev,
*values++ = port->tx_frags_linearized;
*values++ = port->tx_hw_csummed;
- } while (u64_stats_fetch_retry(&port->tx_stats_syncp, start));
+ } while (u64_stats_fetch_retry_irq(&port->tx_stats_syncp, start));
}
static int gmac_get_ksettings(struct net_device *netdev,
diff --git a/drivers/net/ethernet/google/gve/gve_ethtool.c b/drivers/net/ethernet/google/gve/gve_ethtool.c
index 66f9b37..80a8c0c 100644
--- a/drivers/net/ethernet/google/gve/gve_ethtool.c
+++ b/drivers/net/ethernet/google/gve/gve_ethtool.c
@@ -172,14 +172,14 @@ static int gve_get_sset_count(struct net_device *netdev, int sset)
struct gve_rx_ring *rx = &priv->rx[ring];
start =
- u64_stats_fetch_begin(&priv->rx[ring].statss);
+ u64_stats_fetch_begin_irq(&priv->rx[ring].statss);
tmp_rx_pkts = rx->rpackets;
tmp_rx_bytes = rx->rbytes;
tmp_rx_skb_alloc_fail = rx->rx_skb_alloc_fail;
tmp_rx_buf_alloc_fail = rx->rx_buf_alloc_fail;
tmp_rx_desc_err_dropped_pkt =
rx->rx_desc_err_dropped_pkt;
- } while (u64_stats_fetch_retry(&priv->rx[ring].statss,
+ } while (u64_stats_fetch_retry_irq(&priv->rx[ring].statss,
start));
rx_pkts += tmp_rx_pkts;
rx_bytes += tmp_rx_bytes;
@@ -193,10 +193,10 @@ static int gve_get_sset_count(struct net_device *netdev, int sset)
if (priv->tx) {
do {
start =
- u64_stats_fetch_begin(&priv->tx[ring].statss);
+ u64_stats_fetch_begin_irq(&priv->tx[ring].statss);
tmp_tx_pkts = priv->tx[ring].pkt_done;
tmp_tx_bytes = priv->tx[ring].bytes_done;
- } while (u64_stats_fetch_retry(&priv->tx[ring].statss,
+ } while (u64_stats_fetch_retry_irq(&priv->tx[ring].statss,
start));
tx_pkts += tmp_tx_pkts;
tx_bytes += tmp_tx_bytes;
@@ -254,13 +254,13 @@ static int gve_get_sset_count(struct net_device *netdev, int sset)
data[i++] = rx->cnt;
do {
start =
- u64_stats_fetch_begin(&priv->rx[ring].statss);
+ u64_stats_fetch_begin_irq(&priv->rx[ring].statss);
tmp_rx_bytes = rx->rbytes;
tmp_rx_skb_alloc_fail = rx->rx_skb_alloc_fail;
tmp_rx_buf_alloc_fail = rx->rx_buf_alloc_fail;
tmp_rx_desc_err_dropped_pkt =
rx->rx_desc_err_dropped_pkt;
- } while (u64_stats_fetch_retry(&priv->rx[ring].statss,
+ } while (u64_stats_fetch_retry_irq(&priv->rx[ring].statss,
start));
data[i++] = tmp_rx_bytes;
/* rx dropped packets */
@@ -313,9 +313,9 @@ static int gve_get_sset_count(struct net_device *netdev, int sset)
data[i++] = tx->done;
do {
start =
- u64_stats_fetch_begin(&priv->tx[ring].statss);
+ u64_stats_fetch_begin_irq(&priv->tx[ring].statss);
tmp_tx_bytes = tx->bytes_done;
- } while (u64_stats_fetch_retry(&priv->tx[ring].statss,
+ } while (u64_stats_fetch_retry_irq(&priv->tx[ring].statss,
start));
data[i++] = tmp_tx_bytes;
data[i++] = tx->wake_queue;
diff --git a/drivers/net/ethernet/google/gve/gve_main.c b/drivers/net/ethernet/google/gve/gve_main.c
index 6cb75bb..f0c1e6c8 100644
--- a/drivers/net/ethernet/google/gve/gve_main.c
+++ b/drivers/net/ethernet/google/gve/gve_main.c
@@ -40,10 +40,10 @@ static void gve_get_stats(struct net_device *dev, struct rtnl_link_stats64 *s)
for (ring = 0; ring < priv->rx_cfg.num_queues; ring++) {
do {
start =
- u64_stats_fetch_begin(&priv->rx[ring].statss);
+ u64_stats_fetch_begin_irq(&priv->rx[ring].statss);
packets = priv->rx[ring].rpackets;
bytes = priv->rx[ring].rbytes;
- } while (u64_stats_fetch_retry(&priv->rx[ring].statss,
+ } while (u64_stats_fetch_retry_irq(&priv->rx[ring].statss,
start));
s->rx_packets += packets;
s->rx_bytes += bytes;
@@ -53,10 +53,10 @@ static void gve_get_stats(struct net_device *dev, struct rtnl_link_stats64 *s)
for (ring = 0; ring < priv->tx_cfg.num_queues; ring++) {
do {
start =
- u64_stats_fetch_begin(&priv->tx[ring].statss);
+ u64_stats_fetch_begin_irq(&priv->tx[ring].statss);
packets = priv->tx[ring].pkt_done;
bytes = priv->tx[ring].bytes_done;
- } while (u64_stats_fetch_retry(&priv->tx[ring].statss,
+ } while (u64_stats_fetch_retry_irq(&priv->tx[ring].statss,
start));
s->tx_packets += packets;
s->tx_bytes += bytes;
@@ -1041,9 +1041,9 @@ void gve_handle_report_stats(struct gve_priv *priv)
if (priv->tx) {
for (idx = 0; idx < priv->tx_cfg.num_queues; idx++) {
do {
- start = u64_stats_fetch_begin(&priv->tx[idx].statss);
+ start = u64_stats_fetch_begin_irq(&priv->tx[idx].statss);
tx_bytes = priv->tx[idx].bytes_done;
- } while (u64_stats_fetch_retry(&priv->tx[idx].statss, start));
+ } while (u64_stats_fetch_retry_irq(&priv->tx[idx].statss, start));
stats[stats_idx++] = (struct stats) {
.stat_name = cpu_to_be32(TX_WAKE_CNT),
.value = cpu_to_be64(priv->tx[idx].wake_queue),
diff --git a/drivers/net/ethernet/huawei/hinic/hinic_rx.c b/drivers/net/ethernet/huawei/hinic/hinic_rx.c
index 57d5d79..1b57b67 100644
--- a/drivers/net/ethernet/huawei/hinic/hinic_rx.c
+++ b/drivers/net/ethernet/huawei/hinic/hinic_rx.c
@@ -375,7 +375,7 @@ void hinic_rxq_get_stats(struct hinic_rxq *rxq,
u64_stats_update_begin(&stats->syncp);
do {
- start = u64_stats_fetch_begin(&rxq_stats->syncp);
+ start = u64_stats_fetch_begin_irq(&rxq_stats->syncp);
stats->bytes = rxq_stats->bytes;
stats->packets = rxq_stats->packets;
stats->errors = rxq_stats->csum_errors +
@@ -384,7 +384,7 @@ void hinic_rxq_get_stats(struct hinic_rxq *rxq,
stats->other_errors = rxq_stats->other_errors;
stats->dropped = rxq_stats->dropped;
stats->rx_buf_empty = rxq_stats->rx_buf_empty;
- } while (u64_stats_fetch_retry(&rxq_stats->syncp, start));
+ } while (u64_stats_fetch_retry_irq(&rxq_stats->syncp, start));
u64_stats_update_end(&stats->syncp);
}
diff --git a/drivers/net/ethernet/huawei/hinic/hinic_tx.c b/drivers/net/ethernet/huawei/hinic/hinic_tx.c
index 75fa344..ff37b6f 100644
--- a/drivers/net/ethernet/huawei/hinic/hinic_tx.c
+++ b/drivers/net/ethernet/huawei/hinic/hinic_tx.c
@@ -61,7 +61,7 @@ void hinic_txq_get_stats(struct hinic_txq *txq,
u64_stats_update_begin(&stats->syncp);
do {
- start = u64_stats_fetch_begin(&txq_stats->syncp);
+ start = u64_stats_fetch_begin_irq(&txq_stats->syncp);
stats->bytes = txq_stats->bytes;
stats->packets = txq_stats->packets;
stats->busy = txq_stats->busy;
@@ -69,7 +69,7 @@ void hinic_txq_get_stats(struct hinic_txq *txq,
stats->dropped = txq_stats->dropped;
stats->big_frags_pkts = txq_stats->big_frags_pkts;
stats->big_udp_pkts = txq_stats->big_udp_pkts;
- } while (u64_stats_fetch_retry(&txq_stats->syncp, start));
+ } while (u64_stats_fetch_retry_irq(&txq_stats->syncp, start));
u64_stats_update_end(&stats->syncp);
}
diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
index dfc1f32..5ab230aa 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
+++ b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
@@ -3373,21 +3373,21 @@ static void nfp_net_stat64(struct net_device *netdev,
unsigned int start;
do {
- start = u64_stats_fetch_begin(&r_vec->rx_sync);
+ start = u64_stats_fetch_begin_irq(&r_vec->rx_sync);
data[0] = r_vec->rx_pkts;
data[1] = r_vec->rx_bytes;
data[2] = r_vec->rx_drops;
- } while (u64_stats_fetch_retry(&r_vec->rx_sync, start));
+ } while (u64_stats_fetch_retry_irq(&r_vec->rx_sync, start));
stats->rx_packets += data[0];
stats->rx_bytes += data[1];
stats->rx_dropped += data[2];
do {
- start = u64_stats_fetch_begin(&r_vec->tx_sync);
+ start = u64_stats_fetch_begin_irq(&r_vec->tx_sync);
data[0] = r_vec->tx_pkts;
data[1] = r_vec->tx_bytes;
data[2] = r_vec->tx_errors;
- } while (u64_stats_fetch_retry(&r_vec->tx_sync, start));
+ } while (u64_stats_fetch_retry_irq(&r_vec->tx_sync, start));
stats->tx_packets += data[0];
stats->tx_bytes += data[1];
stats->tx_errors += data[2];
diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_ethtool.c b/drivers/net/ethernet/netronome/nfp/nfp_net_ethtool.c
index bfcd90f..d4136d3 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_net_ethtool.c
+++ b/drivers/net/ethernet/netronome/nfp/nfp_net_ethtool.c
@@ -498,7 +498,7 @@ static u64 *nfp_vnic_get_sw_stats(struct net_device *netdev, u64 *data)
unsigned int start;
do {
- start = u64_stats_fetch_begin(&nn->r_vecs[i].rx_sync);
+ start = u64_stats_fetch_begin_irq(&nn->r_vecs[i].rx_sync);
data[0] = nn->r_vecs[i].rx_pkts;
tmp[0] = nn->r_vecs[i].hw_csum_rx_ok;
tmp[1] = nn->r_vecs[i].hw_csum_rx_inner_ok;
@@ -506,10 +506,10 @@ static u64 *nfp_vnic_get_sw_stats(struct net_device *netdev, u64 *data)
tmp[3] = nn->r_vecs[i].hw_csum_rx_error;
tmp[4] = nn->r_vecs[i].rx_replace_buf_alloc_fail;
tmp[5] = nn->r_vecs[i].hw_tls_rx;
- } while (u64_stats_fetch_retry(&nn->r_vecs[i].rx_sync, start));
+ } while (u64_stats_fetch_retry_irq(&nn->r_vecs[i].rx_sync, start));
do {
- start = u64_stats_fetch_begin(&nn->r_vecs[i].tx_sync);
+ start = u64_stats_fetch_begin_irq(&nn->r_vecs[i].tx_sync);
data[1] = nn->r_vecs[i].tx_pkts;
data[2] = nn->r_vecs[i].tx_busy;
tmp[6] = nn->r_vecs[i].hw_csum_tx;
@@ -519,7 +519,7 @@ static u64 *nfp_vnic_get_sw_stats(struct net_device *netdev, u64 *data)
tmp[10] = nn->r_vecs[i].hw_tls_tx;
tmp[11] = nn->r_vecs[i].tls_tx_fallback;
tmp[12] = nn->r_vecs[i].tls_tx_no_fallback;
- } while (u64_stats_fetch_retry(&nn->r_vecs[i].tx_sync, start));
+ } while (u64_stats_fetch_retry_irq(&nn->r_vecs[i].tx_sync, start));
data += NN_RVEC_PER_Q_STATS;
diff --git a/drivers/net/netdevsim/netdev.c b/drivers/net/netdevsim/netdev.c
index ad6dbf01..4fb0638 100644
--- a/drivers/net/netdevsim/netdev.c
+++ b/drivers/net/netdevsim/netdev.c
@@ -67,10 +67,10 @@ static int nsim_change_mtu(struct net_device *dev, int new_mtu)
unsigned int start;
do {
- start = u64_stats_fetch_begin(&ns->syncp);
+ start = u64_stats_fetch_begin_irq(&ns->syncp);
stats->tx_bytes = ns->tx_bytes;
stats->tx_packets = ns->tx_packets;
- } while (u64_stats_fetch_retry(&ns->syncp, start));
+ } while (u64_stats_fetch_retry_irq(&ns->syncp, start));
}
static int
diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c
index 461c037..cee39ae 100644
--- a/net/mac80211/sta_info.c
+++ b/net/mac80211/sta_info.c
@@ -2175,9 +2175,9 @@ static inline u64 sta_get_tidstats_msdu(struct ieee80211_sta_rx_stats *rxstats,
u64 value;
do {
- start = u64_stats_fetch_begin(&rxstats->syncp);
+ start = u64_stats_fetch_begin_irq(&rxstats->syncp);
value = rxstats->msdu[tid];
- } while (u64_stats_fetch_retry(&rxstats->syncp, start));
+ } while (u64_stats_fetch_retry_irq(&rxstats->syncp, start));
return value;
}
@@ -2241,9 +2241,9 @@ static inline u64 sta_get_stats_bytes(struct ieee80211_sta_rx_stats *rxstats)
u64 value;
do {
- start = u64_stats_fetch_begin(&rxstats->syncp);
+ start = u64_stats_fetch_begin_irq(&rxstats->syncp);
value = rxstats->bytes;
- } while (u64_stats_fetch_retry(&rxstats->syncp, start));
+ } while (u64_stats_fetch_retry_irq(&rxstats->syncp, start));
return value;
}
diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c
index 9c047c1..7239814 100644
--- a/net/mpls/af_mpls.c
+++ b/net/mpls/af_mpls.c
@@ -1078,9 +1078,9 @@ static void mpls_get_stats(struct mpls_dev *mdev,
p = per_cpu_ptr(mdev->stats, i);
do {
- start = u64_stats_fetch_begin(&p->syncp);
+ start = u64_stats_fetch_begin_irq(&p->syncp);
local = p->stats;
- } while (u64_stats_fetch_retry(&p->syncp, start));
+ } while (u64_stats_fetch_retry_irq(&p->syncp, start));
stats->rx_packets += local.rx_packets;
stats->rx_bytes += local.rx_bytes;
--
1.8.3.1
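For readers new to this API, the hunks above all follow the same reader
pattern. Below is a minimal, hypothetical sketch of it (an editor's
illustration; the structure and field names are not taken from any of the
drivers touched by this patch). On 32-bit kernels the _irq fetch helpers
additionally protect the snapshot against stats writers running in
interrupt context, which is what this backport switches the readers to.

#include <linux/u64_stats_sync.h>

/* Illustrative per-ring counters protected by a u64_stats_sync. */
struct demo_ring_stats {
	u64 packets;
	u64 bytes;
	struct u64_stats_sync syncp;
};

/* Snapshot the counters consistently, retrying if a writer raced us. */
static void demo_read_stats(const struct demo_ring_stats *rs,
			    u64 *packets, u64 *bytes)
{
	unsigned int start;

	do {
		/* _irq variant: also safe when writers run in irq context */
		start = u64_stats_fetch_begin_irq(&rs->syncp);
		*packets = rs->packets;
		*bytes = rs->bytes;
	} while (u64_stats_fetch_retry_irq(&rs->syncp, start));
}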
From: Matthias May <matthias.may(a)westermo.com>
stable inclusion
from stable-v5.10.138
commit 38b83883ce4e4efe8ff0a727192219cac2668d42
category: bugfix
bugzilla: 188217
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id…
-------------------------------------------------
commit ca2bb69514a8bc7f83914122f0d596371352416c upstream.
According to Guillaume Nault, RT_TOS should never be used for IPv6.
Quote:
RT_TOS() is an old macro used to interpret IPv4 TOS as described in
the obsolete RFC 1349. It's conceptually wrong to use it even in IPv4
code, although, given the current state of the code, most of the
existing calls have no consequence.
But using RT_TOS() in IPv6 code is always a bug: IPv6 never had a "TOS"
field to be interpreted the RFC 1349 way. There's no historical
compatibility to worry about.
Fixes: 3a56f86f1be6 ("geneve: handle ipv6 priority like ipv4 tos")
Acked-by: Guillaume Nault <gnault(a)redhat.com>
Signed-off-by: Matthias May <matthias.may(a)westermo.com>
Signed-off-by: Jakub Kicinski <kuba(a)kernel.org>
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
(cherry picked from commit 38b83883ce4e4efe8ff0a727192219cac2668d42)
Signed-off-by: Wang Yufen <wangyufen(a)huawei.com>
---
drivers/net/geneve.c | 3 +--
1 file changed, 1 insertion(+), 2 deletions(-)
diff --git a/drivers/net/geneve.c b/drivers/net/geneve.c
index 5ddb2db..ba9947d 100644
--- a/drivers/net/geneve.c
+++ b/drivers/net/geneve.c
@@ -850,8 +850,7 @@ static struct dst_entry *geneve_get_v6_dst(struct sk_buff *skb,
use_cache = false;
}
- fl6->flowlabel = ip6_make_flowinfo(RT_TOS(prio),
- info->key.label);
+ fl6->flowlabel = ip6_make_flowinfo(prio, info->key.label);
dst_cache = (struct dst_cache *)&info->dst_cache;
if (use_cache) {
dst = dst_cache_get_ip6(dst_cache, &fl6->saddr);
--
1.8.3.1
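To make the bit-level effect concrete, here is a small, self-contained C
illustration (an editor's sketch, not part of the patch): RT_TOS() keeps
only the RFC 1349 TOS bits, so an IPv6 traffic class carrying a DSCP value
loses its upper bits, which is why the fix passes prio to
ip6_make_flowinfo() unmasked.

#include <stdio.h>

#define IPTOS_TOS_MASK	0x1E		/* same value as in <uapi/linux/ip.h> */
#define RT_TOS(tos)	((tos) & IPTOS_TOS_MASK)

int main(void)
{
	unsigned int prio = 0xb8;	/* DSCP EF (46) << 2, a common traffic class */

	printf("full traffic class: 0x%02x\n", prio);		/* 0xb8 */
	printf("after RT_TOS():     0x%02x\n", RT_TOS(prio));	/* 0x18, DSCP mangled */
	return 0;
}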
01 Feb '23
CONFIG_EFI_ZBOOT was introduced to openEuler 22.03 LTS SP1 for loongarch
and enabled by default by this commit:
e46780727555 ("efi/libstub: implement generic EFI zboot").
However, if it is enabled, the compiled kernel cannot be booted on
openEuler 22.03 LTS, so disable it on this version.
Signed-off-by: Xie XiuQi <xiexiuqi(a)huawei.com>
---
arch/arm64/configs/openeuler_defconfig | 14 +-------------
1 file changed, 1 insertion(+), 13 deletions(-)
diff --git a/arch/arm64/configs/openeuler_defconfig b/arch/arm64/configs/openeuler_defconfig
index fbb71f7520a8..63f2adbbc778 100644
--- a/arch/arm64/configs/openeuler_defconfig
+++ b/arch/arm64/configs/openeuler_defconfig
@@ -15,18 +15,6 @@ CONFIG_INIT_ENV_ARG_LIMIT=32
CONFIG_LOCALVERSION=""
# CONFIG_LOCALVERSION_AUTO is not set
CONFIG_BUILD_SALT=""
-CONFIG_HAVE_KERNEL_GZIP=y
-CONFIG_HAVE_KERNEL_LZMA=y
-CONFIG_HAVE_KERNEL_XZ=y
-CONFIG_HAVE_KERNEL_LZO=y
-CONFIG_HAVE_KERNEL_LZ4=y
-CONFIG_HAVE_KERNEL_ZSTD=y
-CONFIG_KERNEL_GZIP=y
-# CONFIG_KERNEL_LZMA is not set
-# CONFIG_KERNEL_XZ is not set
-# CONFIG_KERNEL_LZO is not set
-# CONFIG_KERNEL_LZ4 is not set
-# CONFIG_KERNEL_ZSTD is not set
CONFIG_DEFAULT_INIT=""
CONFIG_DEFAULT_HOSTNAME="(none)"
CONFIG_SYSVIPC=y
@@ -2008,7 +1996,7 @@ CONFIG_EFI_SOFT_RESERVE=y
CONFIG_EFI_PARAMS_FROM_FDT=y
CONFIG_EFI_RUNTIME_WRAPPERS=y
CONFIG_EFI_GENERIC_STUB=y
-CONFIG_EFI_ZBOOT=y
+# CONFIG_EFI_ZBOOT is not set
CONFIG_EFI_ARMSTUB_DTB_LOADER=y
CONFIG_EFI_GENERIC_STUB_INITRD_CMDLINE_LOADER=y
# CONFIG_EFI_BOOTLOADER_CONTROL is not set
--
2.33.0
[PATCH openEuler-5.10] Add relationship graph of openEuler kernel branches
by Zheng Zengkai 01 Feb '23
01 Feb '23
openEuler inclusion
category: doc
bugzilla: https://gitee.com/openeuler/kernel/issues/I6CAZ5
-----------------------------
Newcomers may get confused about the relationship of openEuler kernel
branches, so add a relationship graph of the openEuler kernel branches to
the README in the openEuler kernel master branch.
Signed-off-by: Zheng Zengkai <zhengzengkai(a)huawei.com>
---
README_openEuler | 45 +++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 45 insertions(+)
create mode 100644 README_openEuler
diff --git a/README_openEuler b/README_openEuler
new file mode 100644
index 000000000000..7ec06f5b2538
--- /dev/null
+++ b/README_openEuler
@@ -0,0 +1,45 @@
+README for openEuler kernel branches
+
+Relationship of openEuler kernel 4.19 branches:
+
+------------------------------------------------------------------> Linux 4.19.y
+ |
+ | tag: v4.19.13
+ |
+ -----------------------------------------------------------> openEuler-1.0-LTS
+
+Relationship of openEuler kernel 5.10 branches:
+
+------------------------------------------------------------------> Linux 5.10.y
+ |
+ | tag: v5.10
+ |
+ --------------------------------------------------------------------> OLK-5.10
+ | | | | |
+ | tag: 5.10.0-4.0.0 |
+ | | | | |
+ ------^-----^-----^-----^--------------> openEuler-21.03(Innovative version)
+ | | | |
+ | tag: 5.10.0-5.0.0
+ | | | |
+ ------^-----^-----^--------------> openEuler-21.09(Innovative version)
+ | | |
+ | tag: 5.10.0-60.0.0
+ | | |
+ ------^-----^------------------------------> openEuler-22.03-LTS
+ | |
+ | tag: 5.10.0-106.0.0
+ | |
+ ------^--------------> openEuler-22.09(Innovative version)
+ |
+ | tag: 5.10.0-136.0.0
+ |
+ ---------------------------> openEuler-22.03-LTS-SP1
+
+Relationship of openEuler kernel 6.x branches:
+
+-------------------------------------------------------------------> Linux 6.1.y
+ |
+ | tag: v6.1
+ |
+ -----------------------------------------------> devel-6.1(Innovative version)
--
2.20.1