From: "Darrick J. Wong" djwong@kernel.org
mainline-inclusion
from mainline-v5.13-rc4
commit f1bc5c5630f90b83b339e8970dcf6d03abba5bd5
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I4KIAO
CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
-------------------------------------------------
Merge these two inode walk loops together, since they're pretty similar now.
Signed-off-by: Darrick J. Wong <djwong@kernel.org>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Lihong Kou <koulihong@huawei.com>
Reviewed-by: Zhang Yi <yi.zhang@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 fs/xfs/xfs_icache.c | 162 +++++++++++++-------------------------------
 fs/xfs/xfs_icache.h |   1 +
 fs/xfs/xfs_trace.h  |   5 +-
 3 files changed, 53 insertions(+), 115 deletions(-)
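As a reading aid for the conversion, not part of the patch to apply: both of
the old loops, and the unified xfs_icwalk_ag() below, follow the same batched
tag-walk shape. The standalone sketch below models that shape in plain C with
hypothetical names (tagged_inos, gang_lookup, walk) standing in for the per-AG
radix tree, the gang lookup, and the reclaim cursor; the scan limit is applied
unconditionally here, whereas the kernel gates it with
XFS_ICWALK_FLAG_SCAN_LIMIT.

/*
 * Standalone model of the batched tag walk that xfs_icwalk_ag() runs for
 * every goal, reclaim included.  Hypothetical names throughout: the sorted
 * tagged_inos[] array stands in for the per-AG radix tree, gang_lookup()
 * for radix_tree_gang_lookup_tag(), and *cursor for pag_ici_reclaim_cursor.
 */
#include <stdio.h>
#include <stdbool.h>

#define LOOKUP_BATCH	4	/* the kernel's XFS_LOOKUP_BATCH is 32 */

static const unsigned int tagged_inos[] = { 3, 7, 8, 15, 21, 22, 30, 31 };
#define NR_TAGGED	(sizeof(tagged_inos) / sizeof(tagged_inos[0]))

/* Gather up to batchsz tagged entries at or after first_index. */
static int
gang_lookup(unsigned int first_index, unsigned int *batch, int batchsz)
{
	int		nr_found = 0;
	unsigned int	i;

	for (i = 0; i < NR_TAGGED && nr_found < batchsz; i++) {
		if (tagged_inos[i] >= first_index)
			batch[nr_found++] = tagged_inos[i];
	}
	return nr_found;
}

/* Walk the tagged entries, stopping early once scan_limit is consumed. */
static void
walk(unsigned int *cursor, int scan_limit)
{
	unsigned int	first_index = *cursor;	/* resume from saved cursor */
	bool		done = false;
	int		nr_found;

	do {
		unsigned int	batch[LOOKUP_BATCH];
		int		i;

		nr_found = gang_lookup(first_index, batch, LOOKUP_BATCH);
		if (!nr_found) {
			done = true;
			break;
		}

		for (i = 0; i < nr_found; i++) {
			/* advance the index past the last entry we saw */
			first_index = batch[i] + 1;
			if (first_index < batch[i])	/* index wrapped */
				done = true;
			printf("process %u\n", batch[i]);
		}

		/* charge a full batch whether or not it was full */
		scan_limit -= LOOKUP_BATCH;
		if (scan_limit <= 0)
			break;
	} while (nr_found && !done);

	/* a finished walk resets the cursor; a limited one saves it */
	*cursor = done ? 0 : first_index;
}

int
main(void)
{
	unsigned int	cursor = 0;

	walk(&cursor, 4);	/* limit allows one batch; cursor is saved */
	printf("cursor now %u\n", cursor);
	walk(&cursor, 100);	/* resumes at the cursor and finishes */
	printf("cursor now %u\n", cursor);
	return 0;
}

Running it prints the first batch, a saved cursor of 16, then the remaining
entries and a reset cursor, which is why xfs_icwalk_ag() only persists the
cursor for the reclaim goal and zeroes it once a pass completes.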
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
index c27b1ef18372..d15087b89b33 100644
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@ -43,6 +43,7 @@ enum xfs_icwalk_goal {
 
 	/* Goals directly associated with tagged inodes. */
 	XFS_ICWALK_BLOCKGC	= XFS_ICI_BLOCKGC_TAG,
+	XFS_ICWALK_RECLAIM	= XFS_ICI_RECLAIM_TAG,
 };
 
 #define XFS_ICWALK_NULL_TAG	(-1U)
@@ -67,9 +68,13 @@ static int xfs_icwalk_ag(struct xfs_perag *pag,
 #define XFS_ICWALK_FLAG_DROP_GDQUOT	(1U << 30)
 #define XFS_ICWALK_FLAG_DROP_PDQUOT	(1U << 29)
 
+/* Stop scanning after icw_scan_limit inodes. */
+#define XFS_ICWALK_FLAG_SCAN_LIMIT	(1U << 28)
+
 #define XFS_ICWALK_PRIVATE_FLAGS	(XFS_ICWALK_FLAG_DROP_UDQUOT | \
 					 XFS_ICWALK_FLAG_DROP_GDQUOT | \
-					 XFS_ICWALK_FLAG_DROP_PDQUOT)
+					 XFS_ICWALK_FLAG_DROP_PDQUOT | \
+					 XFS_ICWALK_FLAG_SCAN_LIMIT)
 
 /*
  * Allocate and initialise an xfs_inode.
@@ -756,17 +761,6 @@ xfs_icache_inode_is_allocated(
 	return 0;
 }
 
-/*
- * The inode lookup is done in batches to keep the amount of lock traffic and
- * radix tree lookups to a minimum. The batch size is a trade off between
- * lookup reduction and stack usage. This is in the reclaim path, so we can't
- * be too greedy.
- *
- * XXX: This will be moved closer to xfs_icwalk* once we get rid of the
- * separate reclaim walk functions.
- */
-#define XFS_LOOKUP_BATCH	32
-
 #ifdef CONFIG_XFS_QUOTA
 /* Decide if we want to grab this inode to drop its dquots. */
 static bool
@@ -876,7 +870,7 @@ xfs_dqrele_all_inodes(
  * Return true if we grabbed it, false otherwise.
  */
 static bool
-xfs_reclaim_inode_grab(
+xfs_reclaim_igrab(
 	struct xfs_inode	*ip)
 {
 	ASSERT(rcu_read_lock_held());
@@ -988,108 +982,13 @@ xfs_reclaim_inode(
 	xfs_iflags_clear(ip, XFS_IRECLAIM);
 }
 
-/*
- * Walk the AGs and reclaim the inodes in them. Even if the filesystem is
- * corrupted, we still want to try to reclaim all the inodes. If we don't,
- * then a shut down during filesystem unmount reclaim walk leak all the
- * unreclaimed inodes.
- *
- * Returns non-zero if any AGs or inodes were skipped in the reclaim pass
- * so that callers that want to block until all dirty inodes are written back
- * and reclaimed can sanely loop.
- */
-static void
-xfs_reclaim_inodes_ag(
-	struct xfs_mount	*mp,
-	int			*nr_to_scan)
-{
-	struct xfs_perag	*pag;
-	xfs_agnumber_t		ag = 0;
-
-	while ((pag = xfs_perag_get_tag(mp, ag, XFS_ICI_RECLAIM_TAG))) {
-		unsigned long	first_index = 0;
-		int		done = 0;
-		int		nr_found = 0;
-
-		ag = pag->pag_agno + 1;
-
-		first_index = READ_ONCE(pag->pag_ici_reclaim_cursor);
-		do {
-			struct xfs_inode *batch[XFS_LOOKUP_BATCH];
-			int	i;
-
-			rcu_read_lock();
-			nr_found = radix_tree_gang_lookup_tag(
-					&pag->pag_ici_root,
-					(void **)batch, first_index,
-					XFS_LOOKUP_BATCH,
-					XFS_ICI_RECLAIM_TAG);
-			if (!nr_found) {
-				done = 1;
-				rcu_read_unlock();
-				break;
-			}
-
-			/*
-			 * Grab the inodes before we drop the lock. if we found
-			 * nothing, nr == 0 and the loop will be skipped.
-			 */
-			for (i = 0; i < nr_found; i++) {
-				struct xfs_inode *ip = batch[i];
-
-				if (done || !xfs_reclaim_inode_grab(ip))
-					batch[i] = NULL;
-
-				/*
-				 * Update the index for the next lookup. Catch
-				 * overflows into the next AG range which can
-				 * occur if we have inodes in the last block of
-				 * the AG and we are currently pointing to the
-				 * last inode.
-				 *
-				 * Because we may see inodes that are from the
-				 * wrong AG due to RCU freeing and
-				 * reallocation, only update the index if it
-				 * lies in this AG. It was a race that lead us
-				 * to see this inode, so another lookup from
-				 * the same index will not find it again.
-				 */
-				if (XFS_INO_TO_AGNO(mp, ip->i_ino) !=
-								pag->pag_agno)
-					continue;
-				first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
-				if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino))
-					done = 1;
-			}
-
-			/* unlock now we've grabbed the inodes. */
-			rcu_read_unlock();
-
-			for (i = 0; i < nr_found; i++) {
-				if (batch[i])
-					xfs_reclaim_inode(batch[i], pag);
-			}
-
-			*nr_to_scan -= XFS_LOOKUP_BATCH;
-			cond_resched();
-		} while (nr_found && !done && *nr_to_scan > 0);
-
-		if (done)
-			first_index = 0;
-		WRITE_ONCE(pag->pag_ici_reclaim_cursor, first_index);
-		xfs_perag_put(pag);
-	}
-}
-
 void
 xfs_reclaim_inodes(
 	struct xfs_mount	*mp)
 {
-	int		nr_to_scan = INT_MAX;
-
 	while (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_RECLAIM_TAG)) {
 		xfs_ail_push_all_sync(mp->m_ail);
-		xfs_reclaim_inodes_ag(mp, &nr_to_scan);
+		xfs_icwalk(mp, XFS_ICWALK_RECLAIM, NULL);
 	}
 }
 
@@ -1105,11 +1004,16 @@ xfs_reclaim_inodes_nr(
 	struct xfs_mount	*mp,
 	int			nr_to_scan)
 {
+	struct xfs_eofblocks	eofb = {
+		.eof_flags	= XFS_ICWALK_FLAG_SCAN_LIMIT,
+		.icw_scan_limit	= nr_to_scan,
+	};
+
 	/* kick background reclaimer and push the AIL */
 	xfs_reclaim_work_queue(mp);
 	xfs_ail_push_all(mp->m_ail);
 
-	xfs_reclaim_inodes_ag(mp, &nr_to_scan);
+	xfs_icwalk(mp, XFS_ICWALK_RECLAIM, &eofb);
 	return 0;
 }
 
@@ -1219,9 +1123,8 @@ xfs_reclaim_worker(
 {
 	struct xfs_mount *mp = container_of(to_delayed_work(work),
 					struct xfs_mount, m_reclaim_work);
-	int		nr_to_scan = INT_MAX;
 
-	xfs_reclaim_inodes_ag(mp, &nr_to_scan);
+	xfs_icwalk(mp, XFS_ICWALK_RECLAIM, NULL);
 	xfs_reclaim_work_queue(mp);
 }
 
@@ -1692,6 +1595,15 @@ xfs_blockgc_free_quota(
 
 /* XFS Inode Cache Walking Code */
 
+/*
+ * The inode lookup is done in batches to keep the amount of lock traffic and
+ * radix tree lookups to a minimum. The batch size is a trade off between
+ * lookup reduction and stack usage. This is in the reclaim path, so we can't
+ * be too greedy.
+ */
+#define XFS_LOOKUP_BATCH	32
+
+
 /*
  * Decide if we want to grab this inode in anticipation of doing work towards
  * the goal.
@@ -1706,6 +1618,8 @@ xfs_icwalk_igrab(
 		return xfs_dqrele_igrab(ip);
 	case XFS_ICWALK_BLOCKGC:
 		return xfs_blockgc_igrab(ip);
+	case XFS_ICWALK_RECLAIM:
+		return xfs_reclaim_igrab(ip);
 	default:
 		return false;
 	}
@@ -1719,6 +1633,7 @@ static inline int
 xfs_icwalk_process_inode(
 	enum xfs_icwalk_goal	goal,
 	struct xfs_inode	*ip,
+	struct xfs_perag	*pag,
 	struct xfs_eofblocks	*eofb)
 {
 	int			error = 0;
@@ -1730,6 +1645,9 @@ xfs_icwalk_process_inode(
 	case XFS_ICWALK_BLOCKGC:
 		error = xfs_blockgc_scan_inode(ip, eofb);
 		break;
+	case XFS_ICWALK_RECLAIM:
+		xfs_reclaim_inode(ip, pag);
+		break;
 	}
 	return error;
 }
@@ -1754,7 +1672,10 @@ xfs_icwalk_ag(
 restart:
 	done = false;
 	skipped = 0;
-	first_index = 0;
+	if (goal == XFS_ICWALK_RECLAIM)
+		first_index = READ_ONCE(pag->pag_ici_reclaim_cursor);
+	else
+		first_index = 0;
 	nr_found = 0;
 	do {
 		struct xfs_inode *batch[XFS_LOOKUP_BATCH];
@@ -1775,6 +1696,7 @@ xfs_icwalk_ag(
 				XFS_LOOKUP_BATCH, tag);
 
 		if (!nr_found) {
+			done = true;
 			rcu_read_unlock();
 			break;
 		}
@@ -1814,7 +1736,8 @@ xfs_icwalk_ag(
 		for (i = 0; i < nr_found; i++) {
 			if (!batch[i])
 				continue;
-			error = xfs_icwalk_process_inode(goal, batch[i], eofb);
+			error = xfs_icwalk_process_inode(goal, batch[i], pag,
+					eofb);
 			if (error == -EAGAIN) {
 				skipped++;
 				continue;
@@ -1829,8 +1752,19 @@ xfs_icwalk_ag(
 
 		cond_resched();
 
+		if (eofb && (eofb->eof_flags & XFS_ICWALK_FLAG_SCAN_LIMIT)) {
+			eofb->icw_scan_limit -= XFS_LOOKUP_BATCH;
+			if (eofb->icw_scan_limit <= 0)
+				break;
+		}
 	} while (nr_found && !done);
 
+	if (goal == XFS_ICWALK_RECLAIM) {
+		if (done)
+			first_index = 0;
+		WRITE_ONCE(pag->pag_ici_reclaim_cursor, first_index);
+	}
+
 	if (skipped) {
 		delay(1);
 		goto restart;
diff --git a/fs/xfs/xfs_icache.h b/fs/xfs/xfs_icache.h
index 3ec00f1fea86..b6ab1067c52b 100644
--- a/fs/xfs/xfs_icache.h
+++ b/fs/xfs/xfs_icache.h
@@ -15,6 +15,7 @@ struct xfs_eofblocks {
 	kgid_t		eof_gid;
 	prid_t		eof_prid;
 	__u64		eof_min_file_size;
+	int		icw_scan_limit;
 };
 
 /*
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index ab761b160e2c..0fb82dbc4352 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -3878,6 +3878,7 @@ DECLARE_EVENT_CLASS(xfs_eofblocks_class,
 		__field(uint32_t, gid)
 		__field(prid_t, prid)
 		__field(__u64, min_file_size)
+		__field(int, scan_limit)
 		__field(unsigned long, caller_ip)
 	),
 	TP_fast_assign(
@@ -3889,15 +3890,17 @@ DECLARE_EVENT_CLASS(xfs_eofblocks_class,
 					eofb->eof_gid) : 0;
 		__entry->prid = eofb ? eofb->eof_prid : 0;
 		__entry->min_file_size = eofb ? eofb->eof_min_file_size : 0;
+		__entry->scan_limit = eofb ? eofb->icw_scan_limit : 0;
 		__entry->caller_ip = caller_ip;
 	),
-	TP_printk("dev %d:%d flags 0x%x uid %u gid %u prid %u minsize %llu caller %pS",
+	TP_printk("dev %d:%d flags 0x%x uid %u gid %u prid %u minsize %llu scan_limit %d caller %pS",
 		  MAJOR(__entry->dev), MINOR(__entry->dev),
 		  __entry->flags,
 		  __entry->uid,
 		  __entry->gid,
 		  __entry->prid,
 		  __entry->min_file_size,
+		  __entry->scan_limit,
 		  (char *)__entry->caller_ip)
 );
 #define DEFINE_EOFBLOCKS_EVENT(name)	\
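
One behavioural detail the hunks above deliberately carry over from the old
loop: icw_scan_limit is charged a full XFS_LOOKUP_BATCH after every batch,
even a partial one, exactly as *nr_to_scan was. So xfs_reclaim_inodes_nr(mp,
1024) breaks out after at most 1024/32 = 32 batch lookups; the limit bounds
the walk to roughly nr_to_scan inodes rather than counting reclaimed inodes
exactly.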