From: Ding Tianhong <dingtianhong@huawei.com>
ascend inclusion
category: feature
bugzilla: NA
CVE: NA
-------------------------------------------------
The Ascend910 platform boots only in ACPI mode, and the OEM information is recorded in the oem_table_id field of the IORT table, so use that OEM information to enable the Ascend features.
Signed-off-by: Ding Tianhong <dingtianhong@huawei.com>
Reviewed-by: Hanjun Guo <guohanjun@huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
---
 arch/arm64/mm/init.c      |  7 ++++++-
 drivers/acpi/arm64/iort.c | 24 ++++++++++++++++++++++++
 include/linux/init.h      |  7 +++++++
 3 files changed, 37 insertions(+), 1 deletion(-)
diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c index 883350f9cc42..f23da539a476 100644 --- a/arch/arm64/mm/init.c +++ b/arch/arm64/mm/init.c @@ -769,7 +769,7 @@ __setup("keepinitrd", keepinitrd_setup); #endif
#ifdef CONFIG_ASCEND_FEATURES -static int __init ascend_enable_setup(char *__unused) +void ascend_enable_all_features(void) { if (IS_ENABLED(CONFIG_ASCEND_DVPP_MMAP)) enable_mmap_dvpp = 1; @@ -782,6 +782,11 @@ static int __init ascend_enable_setup(char *__unused)
if (IS_ENABLED(CONFIG_SUSPEND)) mem_sleep_current = PM_SUSPEND_ON; +} + +static int __init ascend_enable_setup(char *__unused) +{ + ascend_enable_all_features();
return 1; } diff --git a/drivers/acpi/arm64/iort.c b/drivers/acpi/arm64/iort.c index 5a724bdb4a4d..7408ffab7303 100644 --- a/drivers/acpi/arm64/iort.c +++ b/drivers/acpi/arm64/iort.c @@ -25,6 +25,7 @@ #include <linux/pci.h> #include <linux/platform_device.h> #include <linux/slab.h> +#include <linux/init.h>
#define IORT_TYPE_MASK(type) (1 << (type)) #define IORT_MSI_TYPE (1 << ACPI_IORT_NODE_ITS_GROUP) @@ -1639,6 +1640,26 @@ static void __init iort_init_platform_devices(void) } }
+/* + * This function detects the ascend platform by oem table id. + */ +static bool ascend_platform_detected(struct acpi_table_header *h) +{ + if (!memcmp(h->oem_table_id, "HI19801P", ACPI_OEM_TABLE_ID_SIZE)) + return true; + + if (!memcmp(h->oem_table_id, "HI19802P", ACPI_OEM_TABLE_ID_SIZE)) + return true; + + if (!memcmp(h->oem_table_id, "HI19804P", ACPI_OEM_TABLE_ID_SIZE)) + return true; + + if (!memcmp(h->oem_table_id, "HI1980\0\0", ACPI_OEM_TABLE_ID_SIZE)) + return true; + + return false; +} + void __init acpi_iort_init(void) { acpi_status status; @@ -1654,5 +1675,8 @@ void __init acpi_iort_init(void) return; }
+ if (ascend_platform_detected(iort_table)) + ascend_enable_all_features(); + iort_init_platform_devices(); } diff --git a/include/linux/init.h b/include/linux/init.h index 2538d176dd1f..e6f970ac7bf4 100644 --- a/include/linux/init.h +++ b/include/linux/init.h @@ -306,4 +306,11 @@ void __init parse_early_options(char *cmdline); #define __exit_p(x) NULL #endif
+#ifndef __ASSEMBLY__
+#ifdef CONFIG_ASCEND_FEATURES
+extern void ascend_enable_all_features(void);
+#else
+static inline void ascend_enable_all_features(void) { }
+#endif
+#endif
 #endif /* _LINUX_INIT_H */
From: "Darrick J. Wong" darrick.wong@oracle.com
mainline inclusion
from mainline-5.1-rc1
commit 5837f62592ef7c07859a1b1a6e82030d85869f0f
category: bugfix
bugzilla: NA
CVE: NA

https://gitee.com/openeuler/kernel/issues/I1CYGS?from=project-issue
---------------------------
Fix some indentation issues with the iunlink functions and reorganize the tops of the functions to be identical. No functional changes.
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Brian Foster <bfoster@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Yu Kuai <yukuai3@huawei.com>
Reviewed-by: zhangyi (F) <yi.zhang@huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
---
 fs/xfs/xfs_inode.c | 79 +++++++++++++++++++---------------------------
 1 file changed, 32 insertions(+), 47 deletions(-)
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index f2d06e1e4906..1c0dd9106bb1 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -1915,27 +1915,25 @@ xfs_inactive( */ STATIC int xfs_iunlink( - struct xfs_trans *tp, - struct xfs_inode *ip) + struct xfs_trans *tp, + struct xfs_inode *ip) { - xfs_mount_t *mp = tp->t_mountp; - xfs_agi_t *agi; - xfs_dinode_t *dip; - xfs_buf_t *agibp; - xfs_buf_t *ibp; - xfs_agino_t agino; - short bucket_index; - int offset; - int error; + struct xfs_mount *mp = tp->t_mountp; + struct xfs_agi *agi; + struct xfs_dinode *dip; + struct xfs_buf *agibp; + struct xfs_buf *ibp; + xfs_agnumber_t agno = XFS_INO_TO_AGNO(mp, ip->i_ino); + xfs_agino_t agino = XFS_INO_TO_AGINO(mp, ip->i_ino); + short bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS; + int offset; + int error;
ASSERT(VFS_I(ip)->i_nlink == 0); ASSERT(VFS_I(ip)->i_mode != 0);
- /* - * Get the agi buffer first. It ensures lock ordering - * on the list. - */ - error = xfs_read_agi(mp, tp, XFS_INO_TO_AGNO(mp, ip->i_ino), &agibp); + /* Get the agi buffer first. It ensures lock ordering on the list. */ + error = xfs_read_agi(mp, tp, agno, &agibp); if (error) return error; agi = XFS_BUF_TO_AGI(agibp); @@ -1944,9 +1942,6 @@ xfs_iunlink( * Get the index into the agi hash table for the * list this inode will go on. */ - agino = XFS_INO_TO_AGINO(mp, ip->i_ino); - ASSERT(agino != 0); - bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS; ASSERT(agi->agi_unlinked[bucket_index]); ASSERT(be32_to_cpu(agi->agi_unlinked[bucket_index]) != agino);
@@ -1993,45 +1988,35 @@ xfs_iunlink( */ STATIC int xfs_iunlink_remove( - xfs_trans_t *tp, - xfs_inode_t *ip) + struct xfs_trans *tp, + struct xfs_inode *ip) { - xfs_ino_t next_ino; - xfs_mount_t *mp; - xfs_agi_t *agi; - xfs_dinode_t *dip; - xfs_buf_t *agibp; - xfs_buf_t *ibp; - xfs_agnumber_t agno; - xfs_agino_t agino; - xfs_agino_t next_agino; - xfs_buf_t *last_ibp; - xfs_dinode_t *last_dip = NULL; - short bucket_index; - int offset, last_offset = 0; - int error; - - mp = tp->t_mountp; - agno = XFS_INO_TO_AGNO(mp, ip->i_ino); + struct xfs_mount *mp = tp->t_mountp; + struct xfs_agi *agi; + struct xfs_dinode *dip; + struct xfs_buf *agibp; + struct xfs_buf *ibp; + struct xfs_buf *last_ibp; + struct xfs_dinode *last_dip = NULL; + xfs_ino_t next_ino; + xfs_agnumber_t agno = XFS_INO_TO_AGNO(mp, ip->i_ino); + xfs_agino_t agino = XFS_INO_TO_AGINO(mp, ip->i_ino); + xfs_agino_t next_agino; + short bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS; + int offset; + int last_offset = 0; + int error;
- /* - * Get the agi buffer first. It ensures lock ordering - * on the list. - */ + /* Get the agi buffer first. It ensures lock ordering on the list. */ error = xfs_read_agi(mp, tp, agno, &agibp); if (error) return error; - agi = XFS_BUF_TO_AGI(agibp);
/* * Get the index into the agi hash table for the * list this inode will go on. */ - agino = XFS_INO_TO_AGINO(mp, ip->i_ino); - if (!xfs_verify_agino(mp, agno, agino)) - return -EFSCORRUPTED; - bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS; if (!xfs_verify_agino(mp, agno, be32_to_cpu(agi->agi_unlinked[bucket_index]))) { XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
From: "Darrick J. Wong" darrick.wong@oracle.com
mainline inclusion
from mainline-5.1-rc1
commit 7d36c19538d38f9ff6b93d2a3d23ee879b076dc6
category: bugfix
bugzilla: NA
CVE: NA

https://gitee.com/openeuler/kernel/issues/I1CYGS?from=project-issue
---------------------------
Add a new helper to check that a per-AG inode pointer is either null or points somewhere valid within that AG.
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Brian Foster <bfoster@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Yu Kuai <yukuai3@huawei.com>
Reviewed-by: zhangyi (F) <yi.zhang@huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
---
 fs/xfs/libxfs/xfs_inode_buf.c |  3 +--
 fs/xfs/libxfs/xfs_types.c     | 13 +++++++++++++
 fs/xfs/libxfs/xfs_types.h     |  2 ++
 fs/xfs/scrub/agheader.c       |  8 +++-----
 4 files changed, 19 insertions(+), 7 deletions(-)
diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c index 09d9c8cfa4a0..82c4374acb4c 100644 --- a/fs/xfs/libxfs/xfs_inode_buf.c +++ b/fs/xfs/libxfs/xfs_inode_buf.c @@ -99,8 +99,7 @@ xfs_inode_buf_verify( unlinked_ino = be32_to_cpu(dip->di_next_unlinked); di_ok = dip->di_magic == cpu_to_be16(XFS_DINODE_MAGIC) && xfs_dinode_good_version(mp, dip->di_version) && - (unlinked_ino == NULLAGINO || - xfs_verify_agino(mp, agno, unlinked_ino)); + xfs_verify_agino_or_null(mp, agno, unlinked_ino); if (unlikely(XFS_TEST_ERROR(!di_ok, mp, XFS_ERRTAG_ITOBP_INOTOBP))) { if (readahead) { diff --git a/fs/xfs/libxfs/xfs_types.c b/fs/xfs/libxfs/xfs_types.c index 33a5ca346baf..9873d2723fd9 100644 --- a/fs/xfs/libxfs/xfs_types.c +++ b/fs/xfs/libxfs/xfs_types.c @@ -116,6 +116,19 @@ xfs_verify_agino( return agino >= first && agino <= last; }
+/* + * Verify that an AG inode number pointer neither points outside the AG + * nor points at static metadata, or is NULLAGINO. + */ +bool +xfs_verify_agino_or_null( + struct xfs_mount *mp, + xfs_agnumber_t agno, + xfs_agino_t agino) +{ + return agino == NULLAGINO || xfs_verify_agino(mp, agno, agino); +} + /* * Verify that an FS inode number pointer neither points outside the * filesystem nor points at static AG metadata. diff --git a/fs/xfs/libxfs/xfs_types.h b/fs/xfs/libxfs/xfs_types.h index b9e6c89284c3..b477dbe8859e 100644 --- a/fs/xfs/libxfs/xfs_types.h +++ b/fs/xfs/libxfs/xfs_types.h @@ -161,6 +161,8 @@ void xfs_agino_range(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agino_t *first, xfs_agino_t *last); bool xfs_verify_agino(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agino_t agino); +bool xfs_verify_agino_or_null(struct xfs_mount *mp, xfs_agnumber_t agno, + xfs_agino_t agino); bool xfs_verify_ino(struct xfs_mount *mp, xfs_ino_t ino); bool xfs_internal_inum(struct xfs_mount *mp, xfs_ino_t ino); bool xfs_verify_dir_ino(struct xfs_mount *mp, xfs_ino_t ino); diff --git a/fs/xfs/scrub/agheader.c b/fs/xfs/scrub/agheader.c index 1038bc36c8ac..7fa6ca6721b0 100644 --- a/fs/xfs/scrub/agheader.c +++ b/fs/xfs/scrub/agheader.c @@ -875,19 +875,17 @@ xchk_agi(
/* Check inode pointers */ agino = be32_to_cpu(agi->agi_newino); - if (agino != NULLAGINO && !xfs_verify_agino(mp, agno, agino)) + if (!xfs_verify_agino_or_null(mp, agno, agino)) xchk_block_set_corrupt(sc, sc->sa.agi_bp);
agino = be32_to_cpu(agi->agi_dirino); - if (agino != NULLAGINO && !xfs_verify_agino(mp, agno, agino)) + if (!xfs_verify_agino_or_null(mp, agno, agino)) xchk_block_set_corrupt(sc, sc->sa.agi_bp);
/* Check unlinked inode buckets */ for (i = 0; i < XFS_AGI_UNLINKED_BUCKETS; i++) { agino = be32_to_cpu(agi->agi_unlinked[i]); - if (agino == NULLAGINO) - continue; - if (!xfs_verify_agino(mp, agno, agino)) + if (!xfs_verify_agino_or_null(mp, agno, agino)) xchk_block_set_corrupt(sc, sc->sa.agi_bp); }
From: "Darrick J. Wong" darrick.wong@oracle.com
mainline inclusion
from mainline-5.1-rc1
commit 9a4a5118644e41ac9da7fa7d87ff3b09e61304de
category: bugfix
bugzilla: NA
CVE: NA

https://gitee.com/openeuler/kernel/issues/I1CYGS?from=project-issue
---------------------------
Split the AGI unlinked bucket updates into a separate function. No functional changes.
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Brian Foster <bfoster@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Yu Kuai <yukuai3@huawei.com>
Reviewed-by: zhangyi (F) <yi.zhang@huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
---
 fs/xfs/xfs_inode.c | 65 ++++++++++++++++++++++++++++++++--------------
 fs/xfs/xfs_trace.h | 26 +++++++++++++++++++
 2 files changed, 71 insertions(+), 20 deletions(-)
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 1c0dd9106bb1..7206569efa70 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -1906,6 +1906,43 @@ xfs_inactive( xfs_qm_dqdetach(ip); }
+/* + * Point the AGI unlinked bucket at an inode and log the results. The caller + * is responsible for validating the old value. + */ +STATIC int +xfs_iunlink_update_bucket( + struct xfs_trans *tp, + xfs_agnumber_t agno, + struct xfs_buf *agibp, + unsigned int bucket_index, + xfs_agino_t new_agino) +{ + struct xfs_agi *agi = XFS_BUF_TO_AGI(agibp); + xfs_agino_t old_value; + int offset; + + ASSERT(xfs_verify_agino_or_null(tp->t_mountp, agno, new_agino)); + + old_value = be32_to_cpu(agi->agi_unlinked[bucket_index]); + trace_xfs_iunlink_update_bucket(tp->t_mountp, agno, bucket_index, + old_value, new_agino); + + /* + * We should never find the head of the list already set to the value + * passed in because either we're adding or removing ourselves from the + * head of the list. + */ + if (old_value == new_agino) + return -EFSCORRUPTED; + + agi->agi_unlinked[bucket_index] = cpu_to_be32(new_agino); + offset = offsetof(struct xfs_agi, agi_unlinked) + + (sizeof(xfs_agino_t) * bucket_index); + xfs_trans_log_buf(tp, agibp, offset, offset + sizeof(xfs_agino_t) - 1); + return 0; +} + /* * This is called when the inode's link count has gone to 0 or we are creating * a tmpfile via O_TMPFILE. The inode @ip must have nlink == 0. @@ -1971,16 +2008,8 @@ xfs_iunlink( xfs_inobp_check(mp, ibp); }
- /* - * Point the bucket head pointer at the inode being inserted. - */ - ASSERT(agino != 0); - agi->agi_unlinked[bucket_index] = cpu_to_be32(agino); - offset = offsetof(xfs_agi_t, agi_unlinked) + - (sizeof(xfs_agino_t) * bucket_index); - xfs_trans_log_buf(tp, agibp, offset, - (offset + sizeof(xfs_agino_t) - 1)); - return 0; + /* Point the head of the list to point to this inode. */ + return xfs_iunlink_update_bucket(tp, agno, agibp, bucket_index, agino); }
/* @@ -2056,16 +2085,12 @@ xfs_iunlink_remove( } else { xfs_trans_brelse(tp, ibp); } - /* - * Point the bucket head pointer at the next inode. - */ - ASSERT(next_agino != 0); - ASSERT(next_agino != agino); - agi->agi_unlinked[bucket_index] = cpu_to_be32(next_agino); - offset = offsetof(xfs_agi_t, agi_unlinked) + - (sizeof(xfs_agino_t) * bucket_index); - xfs_trans_log_buf(tp, agibp, offset, - (offset + sizeof(xfs_agino_t) - 1)); + + /* Point the head of the list to the next unlinked inode. */ + error = xfs_iunlink_update_bucket(tp, agno, agibp, bucket_index, + next_agino); + if (error) + return error; } else { /* * We need to search the list for the inode being freed. diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 3043e5ed6495..a3e7813778b0 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -3345,6 +3345,32 @@ DEFINE_TRANS_EVENT(xfs_trans_roll); DEFINE_TRANS_EVENT(xfs_trans_add_item); DEFINE_TRANS_EVENT(xfs_trans_free_items);
+TRACE_EVENT(xfs_iunlink_update_bucket, + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, unsigned int bucket, + xfs_agino_t old_ptr, xfs_agino_t new_ptr), + TP_ARGS(mp, agno, bucket, old_ptr, new_ptr), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(unsigned int, bucket) + __field(xfs_agino_t, old_ptr) + __field(xfs_agino_t, new_ptr) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->agno = agno; + __entry->bucket = bucket; + __entry->old_ptr = old_ptr; + __entry->new_ptr = new_ptr; + ), + TP_printk("dev %d:%d agno %u bucket %u old 0x%x new 0x%x", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->agno, + __entry->bucket, + __entry->old_ptr, + __entry->new_ptr) +); + #endif /* _TRACE_XFS_H */
#undef TRACE_INCLUDE_PATH
From: Brian Foster <bfoster@redhat.com>
mainline inclusion
from mainline-5.1-rc1
commit 627209fbcc2f0d658a5417645859a1d3053ddb59
category: bugfix
bugzilla: NA
CVE: NA

https://gitee.com/openeuler/kernel/issues/I1CZ01?from=project-issue
---------------------------
The writeback delalloc conversion code is racy with respect to changes in the currently cached file mapping. This stems from the fact that the bmapi allocation code requires a file range to allocate and the writeback conversion code assumes the range of the currently cached mapping is still valid with respect to the fork. It may not be valid, however, because the ilock is cycled (potentially multiple times) between the time the cached mapping was populated and the delalloc conversion occurs.
To facilitate a solution to this problem, create a new xfs_bmapi_delalloc() wrapper to xfs_bmapi_write() that takes a file (FSB) offset and attempts to allocate whatever delalloc extent backs the offset. Use a new bmapi flag to cause xfs_bmapi_write() to set the range based on the extent backing the bno parameter unless bno lands in a hole. If bno does land in a hole, fall back to the current behavior (which may result in an error or quietly skipping holes in the specified range depending on other parameters). This patch does not change behavior.
Signed-off-by: Brian Foster <bfoster@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Yu Kuai <yukuai3@huawei.com>
Reviewed-by: zhangyi (F) <yi.zhang@huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
---
 fs/xfs/libxfs/xfs_bmap.c | 59 +++++++++++++++++++++++++++++++++++++---
 fs/xfs/libxfs/xfs_bmap.h |  4 +++
 2 files changed, 59 insertions(+), 4 deletions(-)
diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 0b7145fdb8aa..979d4480eb46 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -4314,10 +4314,6 @@ xfs_bmapi_write( goto error0; }
- n = 0; - end = bno + len; - obno = bno; - if (!xfs_iext_lookup_extent(ip, ifp, bno, &bma.icur, &bma.got)) eof = true; if (!xfs_iext_peek_prev_extent(ifp, &bma.icur, &bma.prev)) @@ -4327,6 +4323,26 @@ xfs_bmapi_write( bma.total = total; bma.datatype = 0;
+ /* + * The reval flag means the caller wants to allocate the entire delalloc + * extent backing bno where bno may not necessarily match the startoff. + * Now that we've looked up the extent, reset the range to map based on + * the extent in the file. If we're in a hole, this may be an error so + * don't adjust anything. + */ + if ((flags & XFS_BMAPI_REVALRANGE) && + !eof && bno >= bma.got.br_startoff) { + ASSERT(flags & XFS_BMAPI_DELALLOC); + bno = bma.got.br_startoff; + len = bma.got.br_blockcount; +#ifdef DEBUG + orig_bno = bno; + orig_len = len; +#endif + } + n = 0; + end = bno + len; + obno = bno; while (bno < end && n < *nmap) { bool need_alloc = false, wasdelay = false;
@@ -4483,6 +4499,41 @@ xfs_bmapi_write( return error; }
+/* + * Convert an existing delalloc extent to real blocks based on file offset. This + * attempts to allocate the entire delalloc extent and may require multiple + * invocations to allocate the target offset if a large enough physical extent + * is not available. + */ +int +xfs_bmapi_convert_delalloc( + struct xfs_trans *tp, + struct xfs_inode *ip, + xfs_fileoff_t offset_fsb, + int whichfork, + struct xfs_bmbt_irec *imap) +{ + int flags = XFS_BMAPI_DELALLOC; + int nimaps = 1; + int error; + int total = XFS_EXTENTADD_SPACE_RES(ip->i_mount, + XFS_DATA_FORK); + + if (whichfork == XFS_COW_FORK) + flags |= XFS_BMAPI_COWFORK | XFS_BMAPI_PREALLOC; + + /* + * The reval flag means to allocate the entire extent; pass a dummy + * length of 1. + */ + flags |= XFS_BMAPI_REVALRANGE; + error = xfs_bmapi_write(tp, ip, offset_fsb, 1, flags, total, imap, + &nimaps); + if (!error && !nimaps) + error = -EFSCORRUPTED; + return error; +} + int xfs_bmapi_remap( struct xfs_trans *tp, diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h index 488dc8860fd7..cc7d8c4bfb32 100644 --- a/fs/xfs/libxfs/xfs_bmap.h +++ b/fs/xfs/libxfs/xfs_bmap.h @@ -107,6 +107,8 @@ struct xfs_extent_free_item /* Do not update the rmap btree. Used for reconstructing bmbt from rmapbt. */ #define XFS_BMAPI_NORMAP 0x2000
+#define XFS_BMAPI_REVALRANGE 0x4000 + #define XFS_BMAPI_FLAGS \ { XFS_BMAPI_ENTIRE, "ENTIRE" }, \ { XFS_BMAPI_METADATA, "METADATA" }, \ @@ -228,6 +230,8 @@ int xfs_bmapi_reserve_delalloc(struct xfs_inode *ip, int whichfork, xfs_fileoff_t off, xfs_filblks_t len, xfs_filblks_t prealloc, struct xfs_bmbt_irec *got, struct xfs_iext_cursor *cur, int eof); +int xfs_bmapi_convert_delalloc(struct xfs_trans *, struct xfs_inode *, + xfs_fileoff_t, int, struct xfs_bmbt_irec *);
static inline void xfs_bmap_add_free(
From: Brian Foster <bfoster@redhat.com>
mainline inclusion
from mainline-5.1-rc1
commit d9252d526ba66a8f95ad2830ae1b62825ef3dbd5
category: bugfix
bugzilla: NA
CVE: NA

https://gitee.com/openeuler/kernel/issues/I1CZ01?from=project-issue
---------------------------
The writeback code caches the current extent mapping across multiple xfs_do_writepage() calls to avoid repeated lookups for sequential pages backed by the same extent. This is known to be slightly racy with extent fork changes in certain difficult to reproduce scenarios. The cached extent is trimmed to within EOF to help avoid the most common vector for this problem via speculative preallocation management, but this is a band-aid that does not address the fundamental problem.
Now that we have an xfs_ifork sequence counter mechanism used to facilitate COW writeback, we can use the same mechanism to validate consistency between the data fork and cached writeback mappings. On its face, this is somewhat of a big hammer approach because any change to the data fork invalidates any mapping currently cached by a writeback in progress regardless of whether the data fork change overlaps with the range under writeback. In practice, however, the impact of this approach is minimal in most cases.
First, data fork changes (delayed allocations) caused by sustained sequential buffered writes are amortized across speculative preallocations. This means that a cached mapping won't be invalidated by each buffered write of a common file copy workload, but rather only on less frequent allocation events. Second, the extent tree is always entirely in-core so an additional lookup of a usable extent mostly costs a shared ilock cycle and in-memory tree lookup. This means that a cached mapping reval is relatively cheap compared to the I/O itself. Third, spurious invalidations don't impact ioend construction. This means that even if the same extent is revalidated multiple times across multiple writepage instances, we still construct and submit the same size ioend (and bio) if the blocks are physically contiguous.
Update struct xfs_writepage_ctx with a new field to hold the sequence number of the data fork associated with the currently cached mapping. Check the wpc seqno against the data fork when the mapping is validated and reestablish the mapping whenever the fork has changed since the mapping was cached. This ensures that writeback always uses a valid extent mapping and thus prevents lost writebacks and stale delalloc block problems.
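For readers unfamiliar with the idiom, here is a minimal sketch of the cache-plus-sequence-counter pattern the patch relies on. The structure and helper below are invented for illustration (plain, compilable C) and are not the XFS code itself:

#include <stdbool.h>

/*
 * Illustrative only: cache an extent mapping together with the fork
 * sequence number observed when it was looked up, and treat the cache
 * as stale whenever the fork has been modified since.
 */
struct cached_extent {
	unsigned long	start;		/* first block covered */
	unsigned long	count;		/* number of blocks */
	unsigned int	seen_seq;	/* fork sequence at lookup time */
};

static bool cached_extent_valid(const struct cached_extent *c,
				unsigned long block, unsigned int cur_seq)
{
	if (block < c->start || block >= c->start + c->count)
		return false;		/* outside the cached range */
	return c->seen_seq == cur_seq;	/* any fork change bumps cur_seq */
}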
Signed-off-by: Brian Foster <bfoster@redhat.com>
Reviewed-by: Allison Henderson <allison.henderson@oracle.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Yu Kuai <yukuai3@huawei.com>
Reviewed-by: zhangyi (F) <yi.zhang@huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
---
 fs/xfs/xfs_aops.c  | 60 ++++++++++++++++++++++++++++++++++++----------
 fs/xfs/xfs_iomap.c |  5 ++--
 2 files changed, 49 insertions(+), 16 deletions(-)
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index b697866946d2..7a6396bd8114 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -29,6 +29,7 @@ struct xfs_writepage_ctx { struct xfs_bmbt_irec imap; unsigned int io_type; + unsigned int data_seq; unsigned int cow_seq; struct xfs_ioend *ioend; }; @@ -301,6 +302,42 @@ xfs_end_bio( xfs_destroy_ioend(ioend, blk_status_to_errno(bio->bi_status)); }
+/* + * Fast revalidation of the cached writeback mapping. Return true if the current + * mapping is valid, false otherwise. + */ +static bool +xfs_imap_valid( + struct xfs_writepage_ctx *wpc, + struct xfs_inode *ip, + xfs_fileoff_t offset_fsb) +{ + if (offset_fsb < wpc->imap.br_startoff || + offset_fsb >= wpc->imap.br_startoff + wpc->imap.br_blockcount) + return false; + /* + * If this is a COW mapping, it is sufficient to check that the mapping + * covers the offset. Be careful to check this first because the caller + * can revalidate a COW mapping without updating the data seqno. + */ + if (wpc->io_type == XFS_IO_COW) + return true; + + /* + * This is not a COW mapping. Check the sequence number of the data fork + * because concurrent changes could have invalidated the extent. Check + * the COW fork because concurrent changes since the last time we + * checked (and found nothing at this offset) could have added + * overlapping blocks. + */ + if (wpc->data_seq != READ_ONCE(ip->i_df.if_seq)) + return false; + if (xfs_inode_has_cow_data(ip) && + wpc->cow_seq != READ_ONCE(ip->i_cowfp->if_seq)) + return false; + return true; +} + STATIC int xfs_map_blocks( struct xfs_writepage_ctx *wpc, @@ -315,9 +352,11 @@ xfs_map_blocks( struct xfs_bmbt_irec imap; int whichfork = XFS_DATA_FORK; struct xfs_iext_cursor icur; - bool imap_valid; int error = 0;
+ if (XFS_FORCED_SHUTDOWN(mp)) + return -EIO; + /* * We have to make sure the cached mapping is within EOF to protect * against eofblocks trimming on file release leaving us with a stale @@ -346,17 +385,9 @@ xfs_map_blocks( * against concurrent updates and provides a memory barrier on the way * out that ensures that we always see the current value. */ - imap_valid = offset_fsb >= wpc->imap.br_startoff && - offset_fsb < wpc->imap.br_startoff + wpc->imap.br_blockcount; - if (imap_valid && - (!xfs_inode_has_cow_data(ip) || - wpc->io_type == XFS_IO_COW || - wpc->cow_seq == READ_ONCE(ip->i_cowfp->if_seq))) + if (xfs_imap_valid(wpc, ip, offset_fsb)) return 0;
- if (XFS_FORCED_SHUTDOWN(mp)) - return -EIO; - /* * If we don't have a valid map, now it's time to get a new one for this * offset. This will convert delayed allocations (including COW ones) @@ -403,9 +434,10 @@ xfs_map_blocks( }
/* - * Map valid and no COW extent in the way? We're done. + * No COW extent overlap. Revalidate now that we may have updated + * ->cow_seq. If the data mapping is still valid, we're done. */ - if (imap_valid) { + if (xfs_imap_valid(wpc, ip, offset_fsb)) { xfs_iunlock(ip, XFS_ILOCK_SHARED); return 0; } @@ -417,6 +449,7 @@ xfs_map_blocks( */ if (!xfs_iext_lookup_extent(ip, &ip->i_df, offset_fsb, &icur, &imap)) imap.br_startoff = end_fsb; /* fake a hole past EOF */ + wpc->data_seq = READ_ONCE(ip->i_df.if_seq); xfs_iunlock(ip, XFS_ILOCK_SHARED);
if (imap.br_startoff > offset_fsb) { @@ -454,7 +487,8 @@ xfs_map_blocks( return 0; allocate_blocks: error = xfs_iomap_write_allocate(ip, whichfork, offset, &imap, - &wpc->cow_seq); + whichfork == XFS_COW_FORK ? + &wpc->cow_seq : &wpc->data_seq); if (error) return error; ASSERT(whichfork == XFS_COW_FORK || cow_fsb == NULLFILEOFF || diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 6320aca39f39..7c544c41c1d1 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -652,7 +652,7 @@ xfs_iomap_write_allocate( int whichfork, xfs_off_t offset, xfs_bmbt_irec_t *imap, - unsigned int *cow_seq) + unsigned int *seq) { xfs_mount_t *mp = ip->i_mount; struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); @@ -768,8 +768,7 @@ xfs_iomap_write_allocate( if (error) goto error0;
- if (whichfork == XFS_COW_FORK) - *cow_seq = READ_ONCE(ifp->if_seq); + *seq = READ_ONCE(ifp->if_seq); xfs_iunlock(ip, XFS_ILOCK_EXCL); }
From: Brian Foster <bfoster@redhat.com>
mainline inclusion
from mainline-5.1-rc1
commit c2b3164320b51a535d7c7a6acdcee255edbb22cf
category: bugfix
bugzilla: NA
CVE: NA

https://gitee.com/openeuler/kernel/issues/I1CZ01?from=project-issue
---------------------------
The writeback delalloc conversion code is racy with respect to changes in the currently cached file mapping outside of the current page. This is because the ilock is cycled between the time the caller originally looked up the mapping and across each real allocation of the provided file range. This code has collected various hacks over the years to help combat the symptoms of these races (i.e., truncate race detection, allocation into hole detection, etc.), but none address the fundamental problem that the imap may not be valid at allocation time.
Rather than continue to use race detection hacks, update writeback delalloc conversion to a model that explicitly converts the delalloc extent backing the current file offset being processed. The current file offset is the only block we can trust to remain once the ilock is dropped because any operation that can remove the block (truncate, hole punch, etc.) must flush and discard pagecache pages first.
Modify xfs_iomap_write_allocate() to use the xfs_bmapi_delalloc() mechanism to request allocation of the entire delalloc extent backing the current offset instead of assuming the extent passed by the caller is unchanged. Record the range specified by the caller and apply it to the resulting allocated extent so previous checks by the caller for COW fork overlap are not lost. Finally, overload the bmapi delalloc flag with the range reval flag behavior since this is the only use case for both.
This ensures that writeback always picks up the correct and current extent associated with the page, regardless of races with other extent modifying operations. If operating on a data fork and the COW overlap state has changed since the ilock was cycled, the caller revalidates against the COW fork sequence number before using the imap for the next block.
Signed-off-by: Brian Foster <bfoster@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Yu Kuai <yukuai3@huawei.com>
Reviewed-by: zhangyi (F) <yi.zhang@huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
---
 fs/xfs/libxfs/xfs_bmap.c |  16 ++--
 fs/xfs/libxfs/xfs_bmap.h |   2 -
 fs/xfs/xfs_iomap.c       | 170 +++++++++++++--------------------------
 3 files changed, 64 insertions(+), 124 deletions(-)
diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 979d4480eb46..02cdcd999cf6 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -4324,15 +4324,14 @@ xfs_bmapi_write( bma.datatype = 0;
/* - * The reval flag means the caller wants to allocate the entire delalloc - * extent backing bno where bno may not necessarily match the startoff. - * Now that we've looked up the extent, reset the range to map based on - * the extent in the file. If we're in a hole, this may be an error so - * don't adjust anything. + * The delalloc flag means the caller wants to allocate the entire + * delalloc extent backing bno where bno may not necessarily match the + * startoff. Now that we've looked up the extent, reset the range to + * map based on the extent in the file. If we're in a hole, this may be + * an error so don't adjust anything. */ - if ((flags & XFS_BMAPI_REVALRANGE) && + if ((flags & XFS_BMAPI_DELALLOC) && !eof && bno >= bma.got.br_startoff) { - ASSERT(flags & XFS_BMAPI_DELALLOC); bno = bma.got.br_startoff; len = bma.got.br_blockcount; #ifdef DEBUG @@ -4523,10 +4522,9 @@ xfs_bmapi_convert_delalloc( flags |= XFS_BMAPI_COWFORK | XFS_BMAPI_PREALLOC;
/* - * The reval flag means to allocate the entire extent; pass a dummy + * The delalloc flag means to allocate the entire extent; pass a dummy * length of 1. */ - flags |= XFS_BMAPI_REVALRANGE; error = xfs_bmapi_write(tp, ip, offset_fsb, 1, flags, total, imap, &nimaps); if (!error && !nimaps) diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h index cc7d8c4bfb32..dc15440bff5c 100644 --- a/fs/xfs/libxfs/xfs_bmap.h +++ b/fs/xfs/libxfs/xfs_bmap.h @@ -107,8 +107,6 @@ struct xfs_extent_free_item /* Do not update the rmap btree. Used for reconstructing bmbt from rmapbt. */ #define XFS_BMAPI_NORMAP 0x2000
-#define XFS_BMAPI_REVALRANGE 0x4000 - #define XFS_BMAPI_FLAGS \ { XFS_BMAPI_ENTIRE, "ENTIRE" }, \ { XFS_BMAPI_METADATA, "METADATA" }, \ diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 7c544c41c1d1..12df214edbe1 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -648,25 +648,19 @@ xfs_file_iomap_begin_delay( */ int xfs_iomap_write_allocate( - xfs_inode_t *ip, - int whichfork, - xfs_off_t offset, - xfs_bmbt_irec_t *imap, - unsigned int *seq) + struct xfs_inode *ip, + int whichfork, + xfs_off_t offset, + struct xfs_bmbt_irec *imap, + unsigned int *seq) { - xfs_mount_t *mp = ip->i_mount; - struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); - xfs_fileoff_t offset_fsb, last_block; - xfs_fileoff_t end_fsb, map_start_fsb; - xfs_filblks_t count_fsb; - xfs_trans_t *tp; - int nimaps; - int error = 0; - int flags = XFS_BMAPI_DELALLOC; - int nres; - - if (whichfork == XFS_COW_FORK) - flags |= XFS_BMAPI_COWFORK | XFS_BMAPI_PREALLOC; + struct xfs_mount *mp = ip->i_mount; + struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); + xfs_fileoff_t offset_fsb; + xfs_fileoff_t map_start_fsb; + xfs_extlen_t map_count_fsb; + struct xfs_trans *tp; + int error = 0;
/* * Make sure that the dquots are there. @@ -675,106 +669,60 @@ xfs_iomap_write_allocate( if (error) return error;
+ /* + * Store the file range the caller is interested in because it encodes + * state such as potential overlap with COW fork blocks. We must trim + * the allocated extent down to this range to maintain consistency with + * what the caller expects. Revalidation of the range itself is the + * responsibility of the caller. + */ offset_fsb = XFS_B_TO_FSBT(mp, offset); - count_fsb = imap->br_blockcount; map_start_fsb = imap->br_startoff; + map_count_fsb = imap->br_blockcount;
- XFS_STATS_ADD(mp, xs_xstrat_bytes, XFS_FSB_TO_B(mp, count_fsb)); + XFS_STATS_ADD(mp, xs_xstrat_bytes, + XFS_FSB_TO_B(mp, imap->br_blockcount));
- while (count_fsb != 0) { + while (true) { /* - * Set up a transaction with which to allocate the - * backing store for the file. Do allocations in a - * loop until we get some space in the range we are - * interested in. The other space that might be allocated - * is in the delayed allocation extent on which we sit - * but before our buffer starts. + * Allocate in a loop because it may take several attempts to + * allocate real blocks for a contiguous delalloc extent if free + * space is sufficiently fragmented. Note that space for the + * extent and indirect blocks was reserved when the delalloc + * extent was created so there's no need to do so here. */ - nimaps = 0; - while (nimaps == 0) { - nres = XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK); - /* - * We have already reserved space for the extent and any - * indirect blocks when creating the delalloc extent, - * there is no need to reserve space in this transaction - * again. - */ - error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, 0, - 0, XFS_TRANS_RESERVE, &tp); - if (error) - return error; - - xfs_ilock(ip, XFS_ILOCK_EXCL); - xfs_trans_ijoin(tp, ip, 0); - - /* - * it is possible that the extents have changed since - * we did the read call as we dropped the ilock for a - * while. We have to be careful about truncates or hole - * punchs here - we are not allowed to allocate - * non-delalloc blocks here. - * - * The only protection against truncation is the pages - * for the range we are being asked to convert are - * locked and hence a truncate will block on them - * first. - * - * As a result, if we go beyond the range we really - * need and hit an delalloc extent boundary followed by - * a hole while we have excess blocks in the map, we - * will fill the hole incorrectly and overrun the - * transaction reservation. - * - * Using a single map prevents this as we are forced to - * check each map we look for overlap with the desired - * range and abort as soon as we find it. Also, given - * that we only return a single map, having one beyond - * what we can return is probably a bit silly. - * - * We also need to check that we don't go beyond EOF; - * this is a truncate optimisation as a truncate sets - * the new file size before block on the pages we - * currently have locked under writeback. Because they - * are about to be tossed, we don't need to write them - * back.... - */ - nimaps = 1; - end_fsb = XFS_B_TO_FSB(mp, XFS_ISIZE(ip)); - error = xfs_bmap_last_offset(ip, &last_block, - XFS_DATA_FORK); - if (error) - goto trans_cancel; + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, 0, 0, + XFS_TRANS_RESERVE, &tp); + if (error) + return error;
- last_block = XFS_FILEOFF_MAX(last_block, end_fsb); - if ((map_start_fsb + count_fsb) > last_block) { - count_fsb = last_block - map_start_fsb; - if (count_fsb == 0) { - error = -EAGAIN; - goto trans_cancel; - } - } + xfs_ilock(ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip, 0);
- /* - * From this point onwards we overwrite the imap - * pointer that the caller gave to us. - */ - error = xfs_bmapi_write(tp, ip, map_start_fsb, - count_fsb, flags, nres, imap, - &nimaps); - if (error) - goto trans_cancel; + /* + * ilock was dropped since imap was populated which means it + * might no longer be valid. The current page is held locked so + * nothing could have removed the block backing offset_fsb. + * Attempt to allocate whatever delalloc extent currently backs + * offset_fsb and put the result in the imap pointer from the + * caller. We'll trim it down to the caller's most recently + * validated range before we return. + */ + error = xfs_bmapi_convert_delalloc(tp, ip, offset_fsb, + whichfork, imap); + if (error) + goto trans_cancel;
- error = xfs_trans_commit(tp); - if (error) - goto error0; + error = xfs_trans_commit(tp); + if (error) + goto error0;
- *seq = READ_ONCE(ifp->if_seq); - xfs_iunlock(ip, XFS_ILOCK_EXCL); - } + *seq = READ_ONCE(ifp->if_seq); + xfs_iunlock(ip, XFS_ILOCK_EXCL);
/* - * See if we were able to allocate an extent that - * covers at least part of the callers request + * See if we were able to allocate an extent that covers at + * least part of the callers request. */ if (!(imap->br_startblock || XFS_IS_REALTIME_INODE(ip))) return xfs_alert_fsblock_zero(ip, imap); @@ -783,15 +731,11 @@ xfs_iomap_write_allocate( (offset_fsb < (imap->br_startoff + imap->br_blockcount))) { XFS_STATS_INC(mp, xs_xstrat_quick); + xfs_trim_extent(imap, map_start_fsb, map_count_fsb); + ASSERT(offset_fsb >= imap->br_startoff && + offset_fsb < imap->br_startoff + imap->br_blockcount); return 0; } - - /* - * So far we have not mapped the requested part of the - * file, just surrounding data, try again. - */ - count_fsb -= imap->br_blockcount; - map_start_fsb = imap->br_startoff + imap->br_blockcount; }
trans_cancel:
From: Eric Sandeen <sandeen@redhat.com>
mainline inclusion
from mainline-v5.9-rc4
commit f4020438fab05364018c91f7e02ebdd192085933
category: bugfix
bugzilla: NA
CVE: CVE-2020-14385
--------------------------------
The boundary test for the fixed-offset parts of xfs_attr_sf_entry in xfs_attr_shortform_verify is off by one, because the variable array at the end is defined as nameval[1] not nameval[]. Hence we need to subtract 1 from the calculation.
This can be shown by:
# touch file
# setfattr -n root.a file
and verifications will fail when it's written to disk.
This only matters for a last attribute which has a single-byte name and no value, otherwise the combination of namelen & valuelen will push endp further out and this test won't fail.
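A hedged, userspace illustration of the arithmetic (the struct below mirrors the layout of xfs_attr_sf_entry, but the names and demo values are invented for this example):

#include <stdint.h>
#include <stdio.h>

struct sf_entry {			/* models xfs_attr_sf_entry */
	uint8_t namelen;
	uint8_t valuelen;
	uint8_t flags;
	uint8_t nameval[1];		/* name and value bytes, concatenated */
};

int main(void)
{
	/* Last entry: 1-byte name, no value -> occupies exactly 4 bytes. */
	char buf[4] = { 1, 0, 0, 'a' };
	char *endp = buf + sizeof(buf);
	struct sf_entry *sfep = (struct sf_entry *)buf;

	/* Old check: sizeof(*sfep) == 4 already counts one nameval byte,
	 * so the comparison trips although the entry fits exactly. */
	printf("old check reports corruption: %d\n",
	       (char *)sfep + sizeof(*sfep) >= endp);		/* prints 1 */

	/* Fixed check subtracts the 1-byte nameval[1] trailer. */
	printf("new check reports corruption: %d\n",
	       (char *)sfep + sizeof(*sfep) - 1 >= endp);	/* prints 0 */
	return 0;
}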
Fixes: 1e1bbd8e7ee06 ("xfs: create structure verifier function for shortform xattrs")
Signed-off-by: Eric Sandeen <sandeen@redhat.com>
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
Reviewed-by: Jason Yan <yanaijie@huawei.com>
Reviewed-by: zhangyi (F) <yi.zhang@huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
---
 fs/xfs/libxfs/xfs_attr_leaf.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)
diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c
index ceb9b0045725..39025b62b449 100644
--- a/fs/xfs/libxfs/xfs_attr_leaf.c
+++ b/fs/xfs/libxfs/xfs_attr_leaf.c
@@ -937,8 +937,10 @@ xfs_attr_shortform_verify(
 		 * struct xfs_attr_sf_entry has a variable length.
 		 * Check the fixed-offset parts of the structure are
 		 * within the data buffer.
+		 * xfs_attr_sf_entry is defined with a 1-byte variable
+		 * array at the end, so we must subtract that off.
 		 */
-		if (((char *)sfep + sizeof(*sfep)) >= endp)
+		if (((char *)sfep + sizeof(*sfep) - 1) >= endp)
 			return __this_address;
/* Don't allow names with known bad length. */
From: Ilya Dryomov <idryomov@gmail.com>
mainline inclusion
from mainline-v5.9-rc5
commit f44d04e696feaf13d192d942c4f14ad2e117065a
category: bugfix
bugzilla: NA
CVE: CVE-2020-25284
--------------------------------
It turns out that currently we rely only on sysfs attribute permissions:
$ ll /sys/bus/rbd/{add*,remove*}
--w------- 1 root root 4096 Sep 3 20:37 /sys/bus/rbd/add
--w------- 1 root root 4096 Sep 3 20:37 /sys/bus/rbd/add_single_major
--w------- 1 root root 4096 Sep 3 20:37 /sys/bus/rbd/remove
--w------- 1 root root 4096 Sep 3 20:38 /sys/bus/rbd/remove_single_major
This means that images can be mapped and unmapped (i.e. block devices can be created and deleted) by a UID 0 process even after it drops all privileges or by any process with CAP_DAC_OVERRIDE in its user namespace as long as UID 0 is mapped into that user namespace.
Be consistent with other virtual block devices (loop, nbd, dm, md, etc) and require CAP_SYS_ADMIN in the initial user namespace for mapping and unmapping, and also for dumping the configuration string and refreshing the image header.
Cc: stable@vger.kernel.org
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
Reviewed-by: Jason Yan <yanaijie@huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
---
 drivers/block/rbd.c | 12 ++++++++++++
 1 file changed, 12 insertions(+)
diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 110129097169..9f1265ce2e36 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -4124,6 +4124,9 @@ static ssize_t rbd_config_info_show(struct device *dev,
 {
 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
 
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
 	return sprintf(buf, "%s\n", rbd_dev->config_info);
 }
 
@@ -4235,6 +4238,9 @@ static ssize_t rbd_image_refresh(struct device *dev,
 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
 	int ret;
 
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
 	ret = rbd_dev_refresh(rbd_dev);
 	if (ret)
 		return ret;
@@ -5846,6 +5852,9 @@ static ssize_t do_rbd_add(struct bus_type *bus,
 	struct rbd_client *rbdc;
 	int rc;
 
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
 	if (!try_module_get(THIS_MODULE))
 		return -ENODEV;
 
@@ -5995,6 +6004,9 @@ static ssize_t do_rbd_remove(struct bus_type *bus,
 	bool force = false;
 	int ret;
 
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
 	dev_id = -1;
 	opt_buf[0] = '\0';
 	sscanf(buf, "%d %5s", &dev_id, opt_buf);
From: Eric Dumazet <edumazet@google.com>
mainline inclusion
from mainline-v5.3-rc1
commit 8e8e2951e3095732d7e780c241f61ea130955a57
category: bugfix
bugzilla: NA
CVE: CVE-2020-14386

Prepare for fixing CVE-2020-14386.
--------------------------------
Under DDOS, we want to be able to increment tp_drops without touching the spinlock. This will help readers to drain the receive queue slightly faster :/
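The same idea in portable C11, purely as an illustration of the counter pattern (the names here are invented; the kernel side uses atomic_t with atomic_inc()/atomic_xchg() as the diff below shows): hot paths bump a lock-free counter and the occasional statistics read drains it with an exchange.

#include <stdatomic.h>
#include <stdio.h>

static atomic_uint drop_counter;

static void on_drop(void)			/* hot path: no lock taken */
{
	atomic_fetch_add_explicit(&drop_counter, 1, memory_order_relaxed);
}

static unsigned int read_and_reset_drops(void)	/* stats path */
{
	return atomic_exchange(&drop_counter, 0);
}

int main(void)
{
	for (int i = 0; i < 5; i++)
		on_drop();
	printf("drops since last read: %u\n", read_and_reset_drops());
	printf("drops since last read: %u\n", read_and_reset_drops());
	return 0;
}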
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
Reviewed-by: Yue Haibing <yuehaibing@huawei.com>
Reviewed-by: Jason Yan <yanaijie@huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
---
 net/packet/af_packet.c | 20 +++++++++++---------
 net/packet/internal.h  |  1 +
 2 files changed, 12 insertions(+), 9 deletions(-)
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 391671a77e96..b6231af63968 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -767,7 +767,7 @@ static void prb_close_block(struct tpacket_kbdq_core *pkc1, struct tpacket_hdr_v1 *h1 = &pbd1->hdr.bh1; struct sock *sk = &po->sk;
- if (po->stats.stats3.tp_drops) + if (atomic_read(&po->tp_drops)) status |= TP_STATUS_LOSING;
last_pkt = (struct tpacket3_hdr *)pkc1->prev; @@ -2133,10 +2133,8 @@ static int packet_rcv(struct sk_buff *skb, struct net_device *dev,
drop_n_acct: is_drop_n_account = true; - spin_lock(&sk->sk_receive_queue.lock); - po->stats.stats1.tp_drops++; + atomic_inc(&po->tp_drops); atomic_inc(&sk->sk_drops); - spin_unlock(&sk->sk_receive_queue.lock);
drop_n_restore: if (skb_head != skb->data && skb_shared(skb)) { @@ -2288,7 +2286,7 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, * Anyways, moving it for V1/V2 only as V3 doesn't need this * at packet level. */ - if (po->stats.stats1.tp_drops) + if (atomic_read(&po->tp_drops)) status |= TP_STATUS_LOSING; }
@@ -2401,9 +2399,9 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, return 0;
drop_n_account: - is_drop_n_account = true; - po->stats.stats1.tp_drops++; spin_unlock(&sk->sk_receive_queue.lock); + atomic_inc(&po->tp_drops); + is_drop_n_account = true;
sk->sk_data_ready(sk); kfree_skb(copy_skb); @@ -3917,6 +3915,7 @@ static int packet_getsockopt(struct socket *sock, int level, int optname, void *data = &val; union tpacket_stats_u st; struct tpacket_rollover_stats rstats; + int drops;
if (level != SOL_PACKET) return -ENOPROTOOPT; @@ -3933,14 +3932,17 @@ static int packet_getsockopt(struct socket *sock, int level, int optname, memcpy(&st, &po->stats, sizeof(st)); memset(&po->stats, 0, sizeof(po->stats)); spin_unlock_bh(&sk->sk_receive_queue.lock); + drops = atomic_xchg(&po->tp_drops, 0);
if (po->tp_version == TPACKET_V3) { lv = sizeof(struct tpacket_stats_v3); - st.stats3.tp_packets += st.stats3.tp_drops; + st.stats3.tp_drops = drops; + st.stats3.tp_packets += drops; data = &st.stats3; } else { lv = sizeof(struct tpacket_stats); - st.stats1.tp_packets += st.stats1.tp_drops; + st.stats1.tp_drops = drops; + st.stats1.tp_packets += drops; data = &st.stats1; }
diff --git a/net/packet/internal.h b/net/packet/internal.h index f10294800aaf..907f4cd2a718 100644 --- a/net/packet/internal.h +++ b/net/packet/internal.h @@ -135,6 +135,7 @@ struct packet_sock { struct net_device __rcu *cached_dev; int (*xmit)(struct sk_buff *skb); struct packet_type prot_hook ____cacheline_aligned_in_smp; + atomic_t tp_drops ____cacheline_aligned_in_smp; };
static struct packet_sock *pkt_sk(struct sock *sk)
From: Or Cohen <orcohen@paloaltonetworks.com>
mainline inclusion
from mainline-v5.9-rc4
commit acf69c946233259ab4d64f8869d4037a198c7f06
category: bugfix
bugzilla: NA
CVE: CVE-2020-14386
--------------------------------
Using tp_reserve to calculate netoff can overflow as tp_reserve is unsigned int and netoff is unsigned short.
This may lead to macoff receiving a smaller value than sizeof(struct virtio_net_hdr), and if po->has_vnet_hdr is set, an out-of-bounds write will occur when calling virtio_net_hdr_from_skb.
The bug is fixed by converting netoff to unsigned int and checking if it exceeds USHRT_MAX.
This addresses CVE-2020-14386
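A hedged, userspace sketch of the truncation (the constants stand in for the TPACKET header sizes and are chosen only to make the numbers visible; they are not the exact kernel values):

#include <stdio.h>

int main(void)
{
	unsigned int tp_reserve = 65514;  /* attacker-chosen via PACKET_RESERVE */
	unsigned int hdr_part   = 32;     /* stands in for TPACKET_ALIGN(tp_hdrlen + maclen) */
	unsigned int vnet_hdr   = 10;     /* sizeof(struct virtio_net_hdr) */
	unsigned int maclen     = 14;     /* Ethernet header */

	/* Before the fix, netoff was unsigned short, so the sum truncates. */
	unsigned short netoff = hdr_part + tp_reserve + vnet_hdr;  /* 65556 wraps to 20 */
	unsigned short macoff = netoff - maclen;                   /* 6, i.e. < vnet_hdr */

	printf("truncated netoff=%u macoff=%u\n", netoff, macoff);

	/* With netoff widened to unsigned int, the new bound check can
	 * reject the packet instead of letting macoff go bad. */
	unsigned int netoff_fixed = hdr_part + tp_reserve + vnet_hdr;
	if (netoff_fixed > 65535 /* USHRT_MAX */)
		printf("packet dropped, netoff=%u exceeds USHRT_MAX\n", netoff_fixed);
	return 0;
}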
Fixes: 8913336a7e8d ("packet: add PACKET_RESERVE sockopt")
Signed-off-by: Or Cohen <orcohen@paloaltonetworks.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
Reviewed-by: Yue Haibing <yuehaibing@huawei.com>
Reviewed-by: Jason Yan <yanaijie@huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
---
 net/packet/af_packet.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index b6231af63968..c5a325697642 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -2160,7 +2160,8 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
 	int skb_len = skb->len;
 	unsigned int snaplen, res;
 	unsigned long status = TP_STATUS_USER;
-	unsigned short macoff, netoff, hdrlen;
+	unsigned short macoff, hdrlen;
+	unsigned int netoff;
 	struct sk_buff *copy_skb = NULL;
 	struct timespec ts;
 	__u32 ts_status;
@@ -2223,6 +2224,10 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
 		}
 		macoff = netoff - maclen;
 	}
+	if (netoff > USHRT_MAX) {
+		atomic_inc(&po->tp_drops);
+		goto drop_n_restore;
+	}
 	if (po->tp_version <= TPACKET_V2) {
 		if (macoff + snaplen > po->rx_ring.frame_size) {
 			if (po->copy_thresh &&
From: Muchun Song <songmuchun@bytedance.com>
mainline inclusion
from mainline-v5.9-rc4
commit 17743798d81238ab13050e8e2833699b54e15467
category: bugfix
bugzilla: NA
CVE: CVE-2020-25285
--------------------------------
There is a race between the assignment of `table->data` and the write of the parsed value through that pointer in __do_proc_doulongvec_minmax() running on another thread.
CPU0:                                        CPU1:
                                             proc_sys_write
hugetlb_sysctl_handler                         proc_sys_call_handler
hugetlb_sysctl_handler_common                    hugetlb_sysctl_handler
  table->data = &tmp;                              hugetlb_sysctl_handler_common
                                                     table->data = &tmp;
                                                   proc_doulongvec_minmax
                                                     do_proc_doulongvec_minmax
sysctl_head_finish                                     __do_proc_doulongvec_minmax
  unuse_table                                            i = table->data;
                                                         *i = val; // corrupt CPU1's stack
Fix this by duplicating the `table`, and only updating the duplicate. Also introduce a helper, proc_hugetlb_doulongvec_minmax(), to simplify the code.
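A hedged userspace analogue of the pattern (the struct and names are invented; the real helper is proc_hugetlb_doulongvec_minmax() in the diff below): copy the shared descriptor and point only the private copy at on-stack storage.

#include <stdio.h>

struct ctl_like {
	const char *name;
	void *data;			/* where the parsed value should land */
};

static struct ctl_like shared = { "nr_hugepages", NULL };

static void handle_write(struct ctl_like *table, unsigned long val)
{
	unsigned long tmp;
	struct ctl_like dup = *table;	/* private copy, as in the patch */

	dup.data = &tmp;		/* shared.data is never redirected */
	*(unsigned long *)dup.data = val;
	printf("%s: parsed %lu into this call's local tmp\n", dup.name, tmp);
}

int main(void)
{
	handle_write(&shared, 2);
	printf("shared.data is still %p, never a dangling stack address\n",
	       shared.data);
	return 0;
}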
The following oops was seen:
BUG: kernel NULL pointer dereference, address: 0000000000000000
#PF: supervisor instruction fetch in kernel mode
#PF: error_code(0x0010) - not-present page
Code: Bad RIP value.
...
Call Trace:
 ? set_max_huge_pages+0x3da/0x4f0
 ? alloc_pool_huge_page+0x150/0x150
 ? proc_doulongvec_minmax+0x46/0x60
 ? hugetlb_sysctl_handler_common+0x1c7/0x200
 ? nr_hugepages_store+0x20/0x20
 ? copy_fd_bitmaps+0x170/0x170
 ? hugetlb_sysctl_handler+0x1e/0x20
 ? proc_sys_call_handler+0x2f1/0x300
 ? unregister_sysctl_table+0xb0/0xb0
 ? __fd_install+0x78/0x100
 ? proc_sys_write+0x14/0x20
 ? __vfs_write+0x4d/0x90
 ? vfs_write+0xef/0x240
 ? ksys_write+0xc0/0x160
 ? __ia32_sys_read+0x50/0x50
 ? __close_fd+0x129/0x150
 ? __x64_sys_write+0x43/0x50
 ? do_syscall_64+0x6c/0x200
 ? entry_SYSCALL_64_after_hwframe+0x44/0xa9
Fixes: e5ff215941d5 ("hugetlb: multiple hstates for multiple page sizes")
Signed-off-by: Muchun Song <songmuchun@bytedance.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Reviewed-by: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Andi Kleen <ak@linux.intel.com>
Link: http://lkml.kernel.org/r/20200828031146.43035-1-songmuchun@bytedance.com
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
Reviewed-by: Jason Yan <yanaijie@huawei.com>
Reviewed-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
---
 mm/hugetlb.c | 26 ++++++++++++++++++++------
 1 file changed, 20 insertions(+), 6 deletions(-)
diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 5356f7e5daf2..efb68abe9f35 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -3045,6 +3045,22 @@ static unsigned int cpuset_mems_nr(unsigned int *array) }
#ifdef CONFIG_SYSCTL +static int proc_hugetlb_doulongvec_minmax(struct ctl_table *table, int write, + void *buffer, size_t *length, + loff_t *ppos, unsigned long *out) +{ + struct ctl_table dup_table; + + /* + * In order to avoid races with __do_proc_doulongvec_minmax(), we + * can duplicate the @table and alter the duplicate of it. + */ + dup_table = *table; + dup_table.data = out; + + return proc_doulongvec_minmax(&dup_table, write, buffer, length, ppos); +} + static int hugetlb_sysctl_handler_common(bool obey_mempolicy, struct ctl_table *table, int write, void __user *buffer, size_t *length, loff_t *ppos) @@ -3056,9 +3072,8 @@ static int hugetlb_sysctl_handler_common(bool obey_mempolicy, if (!hugepages_supported()) return -EOPNOTSUPP;
- table->data = &tmp; - table->maxlen = sizeof(unsigned long); - ret = proc_doulongvec_minmax(table, write, buffer, length, ppos); + ret = proc_hugetlb_doulongvec_minmax(table, write, buffer, length, ppos, + &tmp); if (ret) goto out;
@@ -3115,9 +3130,8 @@ int hugetlb_overcommit_handler(struct ctl_table *table, int write, if (write && hstate_is_gigantic(h)) return -EINVAL;
- table->data = &tmp; - table->maxlen = sizeof(unsigned long); - ret = proc_doulongvec_minmax(table, write, buffer, length, ppos); + ret = proc_hugetlb_doulongvec_minmax(table, write, buffer, length, ppos, + &tmp); if (ret) goto out;
From: Linus Torvalds <torvalds@linux-foundation.org>
mainline inclusion
from mainline-v5.9
commit 50145474f6ef4a9c19205b173da6264a644c7489
category: bugfix
bugzilla: NA
CVE: CVE-2020-14390
--------------------------------
This (and the VGA soft scrollback) turns out to have various nasty small special cases that nobody really is willing to fight. The soft scrollback code was really useful a few decades ago when you typically used the console interactively as the main way to interact with the machine, but that just isn't the case any more.
So it's not worth dragging along.
Tested-by: Yuan Ming <yuanmingbuaa@gmail.com>
Tested-by: Willy Tarreau <w@1wt.eu>
Acked-by: Bartlomiej Zolnierkiewicz <b.zolnierkie@samsung.com>
Acked-by: Daniel Vetter <daniel.vetter@ffwll.ch>
Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
Reviewed-by: Jason Yan <yanaijie@huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
---
 drivers/video/fbdev/core/fbcon.c | 334 +------------------------------
 1 file changed, 4 insertions(+), 330 deletions(-)
diff --git a/drivers/video/fbdev/core/fbcon.c b/drivers/video/fbdev/core/fbcon.c index cb93a6b38160..11d8a1a04799 100644 --- a/drivers/video/fbdev/core/fbcon.c +++ b/drivers/video/fbdev/core/fbcon.c @@ -102,12 +102,6 @@ static int logo_lines; /* logo_shown is an index to vc_cons when >= 0; otherwise follows FBCON_LOGO enums. */ static int logo_shown = FBCON_LOGO_CANSHOW; -/* Software scrollback */ -static int fbcon_softback_size = 32768; -static unsigned long softback_buf, softback_curr; -static unsigned long softback_in; -static unsigned long softback_top, softback_end; -static int softback_lines; /* console mappings */ static int first_fb_vc; static int last_fb_vc = MAX_NR_CONSOLES - 1; @@ -148,8 +142,6 @@ static int margin_color;
static const struct consw fb_con;
-#define CM_SOFTBACK (8) - #define advance_row(p, delta) (unsigned short *)((unsigned long)(p) + (delta) * vc->vc_size_row)
static int fbcon_set_origin(struct vc_data *); @@ -355,18 +347,6 @@ static int get_color(struct vc_data *vc, struct fb_info *info, return color; }
-static void fbcon_update_softback(struct vc_data *vc) -{ - int l = fbcon_softback_size / vc->vc_size_row; - - if (l > 5) - softback_end = softback_buf + l * vc->vc_size_row; - else - /* Smaller scrollback makes no sense, and 0 would screw - the operation totally */ - softback_top = 0; -} - static void fb_flashcursor(struct work_struct *work) { struct fb_info *info = container_of(work, struct fb_info, queue); @@ -396,7 +376,7 @@ static void fb_flashcursor(struct work_struct *work) c = scr_readw((u16 *) vc->vc_pos); mode = (!ops->cursor_flash || ops->cursor_state.enable) ? CM_ERASE : CM_DRAW; - ops->cursor(vc, info, mode, softback_lines, get_color(vc, info, c, 1), + ops->cursor(vc, info, mode, 0, get_color(vc, info, c, 1), get_color(vc, info, c, 0)); console_unlock(); } @@ -453,13 +433,7 @@ static int __init fb_console_setup(char *this_opt) } if (!strncmp(options, "scrollback:", 11)) { - options += 11; - if (*options) { - fbcon_softback_size = simple_strtoul(options, &options, 0); - if (*options == 'k' || *options == 'K') { - fbcon_softback_size *= 1024; - } - } + pr_warn("Ignoring scrollback size option\n"); continue; } @@ -988,31 +962,6 @@ static const char *fbcon_startup(void)
set_blitting_type(vc, info);
- if (info->fix.type != FB_TYPE_TEXT) { - if (fbcon_softback_size) { - if (!softback_buf) { - softback_buf = - (unsigned long) - kmalloc(fbcon_softback_size, - GFP_KERNEL); - if (!softback_buf) { - fbcon_softback_size = 0; - softback_top = 0; - } - } - } else { - if (softback_buf) { - kfree((void *) softback_buf); - softback_buf = 0; - softback_top = 0; - } - } - if (softback_buf) - softback_in = softback_top = softback_curr = - softback_buf; - softback_lines = 0; - } - /* Setup default font */ if (!p->fontdata && !vc->vc_font.data) { if (!fontname[0] || !(font = find_font(fontname))) @@ -1181,9 +1130,6 @@ static void fbcon_init(struct vc_data *vc, int init) if (logo) fbcon_prepare_logo(vc, info, cols, rows, new_cols, new_rows);
- if (vc == svc && softback_buf) - fbcon_update_softback(vc); - if (ops->rotate_font && ops->rotate_font(info, vc)) { ops->rotate = FB_ROTATE_UR; set_blitting_type(vc, info); @@ -1346,7 +1292,6 @@ static void fbcon_cursor(struct vc_data *vc, int mode) { struct fb_info *info = registered_fb[con2fb_map[vc->vc_num]]; struct fbcon_ops *ops = info->fbcon_par; - int y; int c = scr_readw((u16 *) vc->vc_pos);
ops->cur_blink_jiffies = msecs_to_jiffies(vc->vc_cur_blink_ms); @@ -1360,16 +1305,8 @@ static void fbcon_cursor(struct vc_data *vc, int mode) fbcon_add_cursor_timer(info);
ops->cursor_flash = (mode == CM_ERASE) ? 0 : 1; - if (mode & CM_SOFTBACK) { - mode &= ~CM_SOFTBACK; - y = softback_lines; - } else { - if (softback_lines) - fbcon_set_origin(vc); - y = 0; - }
- ops->cursor(vc, info, mode, y, get_color(vc, info, c, 1), + ops->cursor(vc, info, mode, 0, get_color(vc, info, c, 1), get_color(vc, info, c, 0)); }
@@ -1440,8 +1377,6 @@ static void fbcon_set_disp(struct fb_info *info, struct fb_var_screeninfo *var,
if (con_is_visible(vc)) { update_screen(vc); - if (softback_buf) - fbcon_update_softback(vc); } }
@@ -1579,99 +1514,6 @@ static __inline__ void ypan_down_redraw(struct vc_data *vc, int t, int count) scrollback_current = 0; }
-static void fbcon_redraw_softback(struct vc_data *vc, struct display *p, - long delta) -{ - int count = vc->vc_rows; - unsigned short *d, *s; - unsigned long n; - int line = 0; - - d = (u16 *) softback_curr; - if (d == (u16 *) softback_in) - d = (u16 *) vc->vc_origin; - n = softback_curr + delta * vc->vc_size_row; - softback_lines -= delta; - if (delta < 0) { - if (softback_curr < softback_top && n < softback_buf) { - n += softback_end - softback_buf; - if (n < softback_top) { - softback_lines -= - (softback_top - n) / vc->vc_size_row; - n = softback_top; - } - } else if (softback_curr >= softback_top - && n < softback_top) { - softback_lines -= - (softback_top - n) / vc->vc_size_row; - n = softback_top; - } - } else { - if (softback_curr > softback_in && n >= softback_end) { - n += softback_buf - softback_end; - if (n > softback_in) { - n = softback_in; - softback_lines = 0; - } - } else if (softback_curr <= softback_in && n > softback_in) { - n = softback_in; - softback_lines = 0; - } - } - if (n == softback_curr) - return; - softback_curr = n; - s = (u16 *) softback_curr; - if (s == (u16 *) softback_in) - s = (u16 *) vc->vc_origin; - while (count--) { - unsigned short *start; - unsigned short *le; - unsigned short c; - int x = 0; - unsigned short attr = 1; - - start = s; - le = advance_row(s, 1); - do { - c = scr_readw(s); - if (attr != (c & 0xff00)) { - attr = c & 0xff00; - if (s > start) { - fbcon_putcs(vc, start, s - start, - line, x); - x += s - start; - start = s; - } - } - if (c == scr_readw(d)) { - if (s > start) { - fbcon_putcs(vc, start, s - start, - line, x); - x += s - start + 1; - start = s + 1; - } else { - x++; - start++; - } - } - s++; - d++; - } while (s < le); - if (s > start) - fbcon_putcs(vc, start, s - start, line, x); - line++; - if (d == (u16 *) softback_end) - d = (u16 *) softback_buf; - if (d == (u16 *) softback_in) - d = (u16 *) vc->vc_origin; - if (s == (u16 *) softback_end) - s = (u16 *) softback_buf; - if (s == (u16 *) softback_in) - s = (u16 *) vc->vc_origin; - } -} - static void fbcon_redraw_move(struct vc_data *vc, struct display *p, int line, int count, int dy) { @@ -1811,31 +1653,6 @@ static void fbcon_redraw(struct vc_data *vc, struct display *p, } }
-static inline void fbcon_softback_note(struct vc_data *vc, int t, - int count) -{ - unsigned short *p; - - if (vc->vc_num != fg_console) - return; - p = (unsigned short *) (vc->vc_origin + t * vc->vc_size_row); - - while (count) { - scr_memcpyw((u16 *) softback_in, p, vc->vc_size_row); - count--; - p = advance_row(p, 1); - softback_in += vc->vc_size_row; - if (softback_in == softback_end) - softback_in = softback_buf; - if (softback_in == softback_top) { - softback_top += vc->vc_size_row; - if (softback_top == softback_end) - softback_top = softback_buf; - } - } - softback_curr = softback_in; -} - static bool fbcon_scroll(struct vc_data *vc, unsigned int t, unsigned int b, enum con_scroll dir, unsigned int count) { @@ -1858,8 +1675,6 @@ static bool fbcon_scroll(struct vc_data *vc, unsigned int t, unsigned int b, case SM_UP: if (count > vc->vc_rows) /* Maximum realistic size */ count = vc->vc_rows; - if (softback_top) - fbcon_softback_note(vc, t, count); if (logo_shown >= 0) goto redraw_up; switch (p->scrollmode) { @@ -2209,14 +2024,6 @@ static int fbcon_switch(struct vc_data *vc) info = registered_fb[con2fb_map[vc->vc_num]]; ops = info->fbcon_par;
- if (softback_top) { - if (softback_lines) - fbcon_set_origin(vc); - softback_top = softback_curr = softback_in = softback_buf; - softback_lines = 0; - fbcon_update_softback(vc); - } - if (logo_shown >= 0) { struct vc_data *conp2 = vc_cons[logo_shown].d;
@@ -2550,9 +2357,6 @@ static int fbcon_do_set_font(struct vc_data *vc, int w, int h, int cnt; char *old_data = NULL;
- if (con_is_visible(vc) && softback_lines) - fbcon_set_origin(vc); - resize = (w != vc->vc_font.width) || (h != vc->vc_font.height); if (p->userfont) old_data = vc->vc_font.data; @@ -2578,8 +2382,6 @@ static int fbcon_do_set_font(struct vc_data *vc, int w, int h, cols /= w; rows /= h; vc_resize(vc, cols, rows); - if (con_is_visible(vc) && softback_buf) - fbcon_update_softback(vc); } else if (con_is_visible(vc) && vc->vc_mode == KD_TEXT) { fbcon_clear_margins(vc, 0); @@ -2738,19 +2540,7 @@ static void fbcon_set_palette(struct vc_data *vc, const unsigned char *table)
static u16 *fbcon_screen_pos(struct vc_data *vc, int offset) { - unsigned long p; - int line; - - if (vc->vc_num != fg_console || !softback_lines) - return (u16 *) (vc->vc_origin + offset); - line = offset / vc->vc_size_row; - if (line >= softback_lines) - return (u16 *) (vc->vc_origin + offset - - softback_lines * vc->vc_size_row); - p = softback_curr + offset; - if (p >= softback_end) - p += softback_buf - softback_end; - return (u16 *) p; + return (u16 *) (vc->vc_origin + offset); }
static unsigned long fbcon_getxy(struct vc_data *vc, unsigned long pos, @@ -2764,22 +2554,7 @@ static unsigned long fbcon_getxy(struct vc_data *vc, unsigned long pos,
x = offset % vc->vc_cols; y = offset / vc->vc_cols; - if (vc->vc_num == fg_console) - y += softback_lines; ret = pos + (vc->vc_cols - x) * 2; - } else if (vc->vc_num == fg_console && softback_lines) { - unsigned long offset = pos - softback_curr; - - if (pos < softback_curr) - offset += softback_end - softback_buf; - offset /= 2; - x = offset % vc->vc_cols; - y = offset / vc->vc_cols; - ret = pos + (vc->vc_cols - x) * 2; - if (ret == softback_end) - ret = softback_buf; - if (ret == softback_in) - ret = vc->vc_origin; } else { /* Should not happen */ x = y = 0; @@ -2807,106 +2582,11 @@ static void fbcon_invert_region(struct vc_data *vc, u16 * p, int cnt) a = ((a) & 0x88ff) | (((a) & 0x7000) >> 4) | (((a) & 0x0700) << 4); scr_writew(a, p++); - if (p == (u16 *) softback_end) - p = (u16 *) softback_buf; - if (p == (u16 *) softback_in) - p = (u16 *) vc->vc_origin; } }
-static void fbcon_scrolldelta(struct vc_data *vc, int lines) -{ - struct fb_info *info = registered_fb[con2fb_map[fg_console]]; - struct fbcon_ops *ops = info->fbcon_par; - struct display *disp = &fb_display[fg_console]; - int offset, limit, scrollback_old; - - if (softback_top) { - if (vc->vc_num != fg_console) - return; - if (vc->vc_mode != KD_TEXT || !lines) - return; - if (logo_shown >= 0) { - struct vc_data *conp2 = vc_cons[logo_shown].d; - - if (conp2->vc_top == logo_lines - && conp2->vc_bottom == conp2->vc_rows) - conp2->vc_top = 0; - if (logo_shown == vc->vc_num) { - unsigned long p, q; - int i; - - p = softback_in; - q = vc->vc_origin + - logo_lines * vc->vc_size_row; - for (i = 0; i < logo_lines; i++) { - if (p == softback_top) - break; - if (p == softback_buf) - p = softback_end; - p -= vc->vc_size_row; - q -= vc->vc_size_row; - scr_memcpyw((u16 *) q, (u16 *) p, - vc->vc_size_row); - } - softback_in = softback_curr = p; - update_region(vc, vc->vc_origin, - logo_lines * vc->vc_cols); - } - logo_shown = FBCON_LOGO_CANSHOW; - } - fbcon_cursor(vc, CM_ERASE | CM_SOFTBACK); - fbcon_redraw_softback(vc, disp, lines); - fbcon_cursor(vc, CM_DRAW | CM_SOFTBACK); - return; - } - - if (!scrollback_phys_max) - return; - - scrollback_old = scrollback_current; - scrollback_current -= lines; - if (scrollback_current < 0) - scrollback_current = 0; - else if (scrollback_current > scrollback_max) - scrollback_current = scrollback_max; - if (scrollback_current == scrollback_old) - return; - - if (fbcon_is_inactive(vc, info)) - return; - - fbcon_cursor(vc, CM_ERASE); - - offset = disp->yscroll - scrollback_current; - limit = disp->vrows; - switch (disp->scrollmode) { - case SCROLL_WRAP_MOVE: - info->var.vmode |= FB_VMODE_YWRAP; - break; - case SCROLL_PAN_MOVE: - case SCROLL_PAN_REDRAW: - limit -= vc->vc_rows; - info->var.vmode &= ~FB_VMODE_YWRAP; - break; - } - if (offset < 0) - offset += limit; - else if (offset >= limit) - offset -= limit; - - ops->var.xoffset = 0; - ops->var.yoffset = offset * vc->vc_font.height; - ops->update_start(info); - - if (!scrollback_current) - fbcon_cursor(vc, CM_DRAW); -} - static int fbcon_set_origin(struct vc_data *vc) { - if (softback_lines) - fbcon_scrolldelta(vc, softback_lines); return 0; }
@@ -2970,8 +2650,6 @@ static void fbcon_modechanged(struct fb_info *info)
fbcon_set_palette(vc, color_table); update_screen(vc); - if (softback_buf) - fbcon_update_softback(vc); } }
@@ -3413,7 +3091,6 @@ static const struct consw fb_con = { .con_font_default = fbcon_set_def_font, .con_font_copy = fbcon_copy_font, .con_set_palette = fbcon_set_palette, - .con_scrolldelta = fbcon_scrolldelta, .con_set_origin = fbcon_set_origin, .con_invert_region = fbcon_invert_region, .con_screen_pos = fbcon_screen_pos, @@ -3670,9 +3347,6 @@ static void fbcon_exit(void) } #endif
- kfree((void *)softback_buf); - softback_buf = 0UL; - for_each_registered_fb(i) { int pending = 0;
From: Linus Torvalds torvalds@linux-foundation.org
mainline inclusion from mainline-v5.9 commit 06a0df4d1b8b13b551668e47b11fd7629033b7df category: bugfix bugzilla: NA CVE: CVE-2020-14390
--------------------------------
Since the softscroll code got removed, this argument is always zero and makes no sense any more.
Tested-by: Yuan Ming yuanmingbuaa@gmail.com Tested-by: Willy Tarreau w@1wt.eu Reviewed-by: Greg Kroah-Hartman gregkh@linuxfoundation.org Signed-off-by: Linus Torvalds torvalds@linux-foundation.org Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com Reviewed-by: Jason Yan yanaijie@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- drivers/video/fbdev/core/bitblit.c | 11 +---------- drivers/video/fbdev/core/fbcon.c | 4 ++-- drivers/video/fbdev/core/fbcon.h | 2 +- drivers/video/fbdev/core/fbcon_ccw.c | 11 +---------- drivers/video/fbdev/core/fbcon_cw.c | 11 +---------- drivers/video/fbdev/core/fbcon_ud.c | 11 +---------- drivers/video/fbdev/core/tileblit.c | 2 +- 7 files changed, 8 insertions(+), 44 deletions(-)
diff --git a/drivers/video/fbdev/core/bitblit.c b/drivers/video/fbdev/core/bitblit.c index 35ebeeccde4d..436365efae73 100644 --- a/drivers/video/fbdev/core/bitblit.c +++ b/drivers/video/fbdev/core/bitblit.c @@ -234,7 +234,7 @@ static void bit_clear_margins(struct vc_data *vc, struct fb_info *info, }
static void bit_cursor(struct vc_data *vc, struct fb_info *info, int mode, - int softback_lines, int fg, int bg) + int fg, int bg) { struct fb_cursor cursor; struct fbcon_ops *ops = info->fbcon_par; @@ -247,15 +247,6 @@ static void bit_cursor(struct vc_data *vc, struct fb_info *info, int mode,
cursor.set = 0;
- if (softback_lines) { - if (y + softback_lines >= vc->vc_rows) { - mode = CM_ERASE; - ops->cursor_flash = 0; - return; - } else - y += softback_lines; - } - c = scr_readw((u16 *) vc->vc_pos); attribute = get_attribute(info, c); src = vc->vc_font.data + ((c & charmask) * (w * vc->vc_font.height)); diff --git a/drivers/video/fbdev/core/fbcon.c b/drivers/video/fbdev/core/fbcon.c index 11d8a1a04799..28dd895aa031 100644 --- a/drivers/video/fbdev/core/fbcon.c +++ b/drivers/video/fbdev/core/fbcon.c @@ -376,7 +376,7 @@ static void fb_flashcursor(struct work_struct *work) c = scr_readw((u16 *) vc->vc_pos); mode = (!ops->cursor_flash || ops->cursor_state.enable) ? CM_ERASE : CM_DRAW; - ops->cursor(vc, info, mode, 0, get_color(vc, info, c, 1), + ops->cursor(vc, info, mode, get_color(vc, info, c, 1), get_color(vc, info, c, 0)); console_unlock(); } @@ -1306,7 +1306,7 @@ static void fbcon_cursor(struct vc_data *vc, int mode)
ops->cursor_flash = (mode == CM_ERASE) ? 0 : 1;
- ops->cursor(vc, info, mode, 0, get_color(vc, info, c, 1), + ops->cursor(vc, info, mode, get_color(vc, info, c, 1), get_color(vc, info, c, 0)); }
diff --git a/drivers/video/fbdev/core/fbcon.h b/drivers/video/fbdev/core/fbcon.h index 13ae6cdfd70e..ac11200fd2d5 100644 --- a/drivers/video/fbdev/core/fbcon.h +++ b/drivers/video/fbdev/core/fbcon.h @@ -62,7 +62,7 @@ struct fbcon_ops { void (*clear_margins)(struct vc_data *vc, struct fb_info *info, int color, int bottom_only); void (*cursor)(struct vc_data *vc, struct fb_info *info, int mode, - int softback_lines, int fg, int bg); + int fg, int bg); int (*update_start)(struct fb_info *info); int (*rotate_font)(struct fb_info *info, struct vc_data *vc); struct fb_var_screeninfo var; /* copy of the current fb_var_screeninfo */ diff --git a/drivers/video/fbdev/core/fbcon_ccw.c b/drivers/video/fbdev/core/fbcon_ccw.c index 78f3a5621478..71ad6967a70e 100644 --- a/drivers/video/fbdev/core/fbcon_ccw.c +++ b/drivers/video/fbdev/core/fbcon_ccw.c @@ -219,7 +219,7 @@ static void ccw_clear_margins(struct vc_data *vc, struct fb_info *info, }
static void ccw_cursor(struct vc_data *vc, struct fb_info *info, int mode, - int softback_lines, int fg, int bg) + int fg, int bg) { struct fb_cursor cursor; struct fbcon_ops *ops = info->fbcon_par; @@ -236,15 +236,6 @@ static void ccw_cursor(struct vc_data *vc, struct fb_info *info, int mode,
cursor.set = 0;
- if (softback_lines) { - if (y + softback_lines >= vc->vc_rows) { - mode = CM_ERASE; - ops->cursor_flash = 0; - return; - } else - y += softback_lines; - } - c = scr_readw((u16 *) vc->vc_pos); attribute = get_attribute(info, c); src = ops->fontbuffer + ((c & charmask) * (w * vc->vc_font.width)); diff --git a/drivers/video/fbdev/core/fbcon_cw.c b/drivers/video/fbdev/core/fbcon_cw.c index fd098ff17574..31fe5dd651d4 100644 --- a/drivers/video/fbdev/core/fbcon_cw.c +++ b/drivers/video/fbdev/core/fbcon_cw.c @@ -202,7 +202,7 @@ static void cw_clear_margins(struct vc_data *vc, struct fb_info *info, }
static void cw_cursor(struct vc_data *vc, struct fb_info *info, int mode, - int softback_lines, int fg, int bg) + int fg, int bg) { struct fb_cursor cursor; struct fbcon_ops *ops = info->fbcon_par; @@ -219,15 +219,6 @@ static void cw_cursor(struct vc_data *vc, struct fb_info *info, int mode,
cursor.set = 0;
- if (softback_lines) { - if (y + softback_lines >= vc->vc_rows) { - mode = CM_ERASE; - ops->cursor_flash = 0; - return; - } else - y += softback_lines; - } - c = scr_readw((u16 *) vc->vc_pos); attribute = get_attribute(info, c); src = ops->fontbuffer + ((c & charmask) * (w * vc->vc_font.width)); diff --git a/drivers/video/fbdev/core/fbcon_ud.c b/drivers/video/fbdev/core/fbcon_ud.c index e165a3fad29a..b2dd1370e39b 100644 --- a/drivers/video/fbdev/core/fbcon_ud.c +++ b/drivers/video/fbdev/core/fbcon_ud.c @@ -249,7 +249,7 @@ static void ud_clear_margins(struct vc_data *vc, struct fb_info *info, }
static void ud_cursor(struct vc_data *vc, struct fb_info *info, int mode, - int softback_lines, int fg, int bg) + int fg, int bg) { struct fb_cursor cursor; struct fbcon_ops *ops = info->fbcon_par; @@ -267,15 +267,6 @@ static void ud_cursor(struct vc_data *vc, struct fb_info *info, int mode,
cursor.set = 0;
- if (softback_lines) { - if (y + softback_lines >= vc->vc_rows) { - mode = CM_ERASE; - ops->cursor_flash = 0; - return; - } else - y += softback_lines; - } - c = scr_readw((u16 *) vc->vc_pos); attribute = get_attribute(info, c); src = ops->fontbuffer + ((c & charmask) * (w * vc->vc_font.height)); diff --git a/drivers/video/fbdev/core/tileblit.c b/drivers/video/fbdev/core/tileblit.c index 93390312957f..eb664dbf96f6 100644 --- a/drivers/video/fbdev/core/tileblit.c +++ b/drivers/video/fbdev/core/tileblit.c @@ -80,7 +80,7 @@ static void tile_clear_margins(struct vc_data *vc, struct fb_info *info, }
static void tile_cursor(struct vc_data *vc, struct fb_info *info, int mode, - int softback_lines, int fg, int bg) + int fg, int bg) { struct fb_tilecursor cursor; int use_sw = (vc->vc_cursor_type & 0x10);
From: Luo Jiaxing luojiaxing@huawei.com
mainline inclusion from mainline-v5.9-rc5 commit 53de092f47ff40e8d4d78d590d95819d391bf2e0 category: bugfix bugzilla: NA CVE: NA
--------------------------------
It was discovered that sdparm will fail when attempting to disable write cache on a SATA disk connected via libsas.
In the ATA command set, the write cache state is controlled through the SET FEATURES operation. This roughly corresponds to MODE SELECT in SCSI, and the latter command is what is used in the SCSI-ATA translation layer. A subtle difference is that MODE SELECT carries data whereas SET FEATURES is defined as a non-data command in ATA.
Set the DMA data direction to DMA_NONE if the requested ATA command is identified as non-data.
[mkp: commit desc]
Fixes: fa1c1e8f1ece ("[SCSI] Add SATA support to libsas") Link: https://lore.kernel.org/r/1598426666-54544-1-git-send-email-luojiaxing@huawe... Reviewed-by: John Garry john.garry@huawei.com Reviewed-by: Jason Yan yanaijie@huawei.com Signed-off-by: Luo Jiaxing luojiaxing@huawei.com Signed-off-by: Martin K. Petersen martin.petersen@oracle.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com Reviewed-by: Jason Yan yanaijie@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- drivers/scsi/libsas/sas_ata.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-)
diff --git a/drivers/scsi/libsas/sas_ata.c b/drivers/scsi/libsas/sas_ata.c index 67bf9eafe6d9..74f87da00720 100644 --- a/drivers/scsi/libsas/sas_ata.c +++ b/drivers/scsi/libsas/sas_ata.c @@ -223,7 +223,10 @@ static unsigned int sas_ata_qc_issue(struct ata_queued_cmd *qc) task->num_scatter = si; }
- task->data_dir = qc->dma_dir; + if (qc->tf.protocol == ATA_PROT_NODATA) + task->data_dir = DMA_NONE; + else + task->data_dir = qc->dma_dir; task->scatter = qc->sg; task->ata_task.retry_count = 1; task->task_state_flags = SAS_TASK_STATE_PENDING;
From: Zhihao Cheng chengzhihao1@huawei.com
hulk inclusion category: bugfix bugzilla: NA CVE: NA
---------------------------
The __this_cpu*() accessors are (in general) IRQ-unsafe which, given that percpu-rwsem is a blocking primitive, should be just fine.
However, file_end_write() is used from IRQ context and will cause load-store issues on architectures where the per-cpu accessors are not natively IRQ-safe.
Fix it by using the IRQ-safe this_cpu_*() for operations on read_count. This will generate more expensive code on a number of platforms, which might cause a performance regression for some of the other percpu-rwsem users.
If any such is reported, we can consider alternative solutions.
Fixes: 70fe2f48152e ("aio: fix freeze protection of aio writes") Signed-off-by: Hou Tao houtao1@huawei.com Signed-off-by: Peter Zijlstra (Intel) peterz@infradead.org Link: https://lkml.kernel.org/r/20200915140750.137881-1-houtao1@huawei.com
Signed-off-by: Zhihao Cheng chengzhihao1@huawei.com Reviewed-by: Hou Tao houtao1@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- include/linux/percpu-rwsem.h | 6 +++--- kernel/locking/percpu-rwsem.c | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-)
diff --git a/include/linux/percpu-rwsem.h b/include/linux/percpu-rwsem.h index 79b99d653e03..78abe15f7e66 100644 --- a/include/linux/percpu-rwsem.h +++ b/include/linux/percpu-rwsem.h @@ -44,7 +44,7 @@ static inline void percpu_down_read_preempt_disable(struct percpu_rw_semaphore * * and that one the synchronize_sched() is done, the writer will see * anything we did within this RCU-sched read-size critical section. */ - __this_cpu_inc(*sem->read_count); + this_cpu_inc(*sem->read_count); if (unlikely(!rcu_sync_is_idle(&sem->rss))) __percpu_down_read(sem, false); /* Unconditional memory barrier */ barrier(); @@ -68,7 +68,7 @@ static inline int percpu_down_read_trylock(struct percpu_rw_semaphore *sem) /* * Same as in percpu_down_read(). */ - __this_cpu_inc(*sem->read_count); + this_cpu_inc(*sem->read_count); if (unlikely(!rcu_sync_is_idle(&sem->rss))) ret = __percpu_down_read(sem, true); /* Unconditional memory barrier */ preempt_enable(); @@ -94,7 +94,7 @@ static inline void percpu_up_read_preempt_enable(struct percpu_rw_semaphore *sem * Same as in percpu_down_read(). */ if (likely(rcu_sync_is_idle(&sem->rss))) - __this_cpu_dec(*sem->read_count); + this_cpu_dec(*sem->read_count); else __percpu_up_read(sem); /* Unconditional memory barrier */ preempt_enable(); diff --git a/kernel/locking/percpu-rwsem.c b/kernel/locking/percpu-rwsem.c index 883cf1b92d90..6a2824fe7139 100644 --- a/kernel/locking/percpu-rwsem.c +++ b/kernel/locking/percpu-rwsem.c @@ -99,7 +99,7 @@ void __percpu_up_read(struct percpu_rw_semaphore *sem) * zero, as that is the only time it matters) they will also see our * critical section. */ - __this_cpu_dec(*sem->read_count); + this_cpu_dec(*sem->read_count);
/* Prod writer to recheck readers_active */ rcuwait_wake_up(&sem->writer);
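To see why the accessor choice matters, here is a simplified sketch of the reader fast path after this change (based on the hunks above; the slow path and preemption handling are elided):

	/* this_cpu_inc() is IRQ-safe, so a file_end_write() running from
	 * interrupt context cannot lose an update to the per-CPU counter,
	 * which could happen with the raw __this_cpu_inc(). */
	this_cpu_inc(*sem->read_count);
	if (unlikely(!rcu_sync_is_idle(&sem->rss)))
		__percpu_down_read(sem, false);	/* slow path, full barrier */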
From: Joerg Roedel jroedel@suse.de
mainline inclusion from mainline-v4.20-rc1 commit b4ef725eeba158f365da9de1f05149094643ddea category:feature bugzilla:NA CVE:NA
-------------------
These wrappers will be used to easily change the location of the field later when all users are converted.
Signed-off-by: Joerg Roedel jroedel@suse.de
Conflicts: include/linux/iommu.h
Signed-off-by: Chen Jun chenjun102@huawei.com Reviewed-by: Hanjun Guo guohanjun@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- drivers/iommu/iommu.c | 14 +++++++------- include/linux/iommu.h | 11 +++++++++++ 2 files changed, 18 insertions(+), 7 deletions(-)
diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 8cd0604049f0..cde37ab00760 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -2311,7 +2311,7 @@ const struct iommu_ops *iommu_ops_from_fwnode(struct fwnode_handle *fwnode) int iommu_fwspec_init(struct device *dev, struct fwnode_handle *iommu_fwnode, const struct iommu_ops *ops) { - struct iommu_fwspec *fwspec = dev->iommu_fwspec; + struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
if (fwspec) return ops == fwspec->ops ? 0 : -EINVAL; @@ -2323,26 +2323,26 @@ int iommu_fwspec_init(struct device *dev, struct fwnode_handle *iommu_fwnode, of_node_get(to_of_node(iommu_fwnode)); fwspec->iommu_fwnode = iommu_fwnode; fwspec->ops = ops; - dev->iommu_fwspec = fwspec; + dev_iommu_fwspec_set(dev, fwspec); return 0; } EXPORT_SYMBOL_GPL(iommu_fwspec_init);
void iommu_fwspec_free(struct device *dev) { - struct iommu_fwspec *fwspec = dev->iommu_fwspec; + struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
if (fwspec) { fwnode_handle_put(fwspec->iommu_fwnode); kfree(fwspec); - dev->iommu_fwspec = NULL; + dev_iommu_fwspec_set(dev, NULL); } } EXPORT_SYMBOL_GPL(iommu_fwspec_free);
int iommu_fwspec_add_ids(struct device *dev, u32 *ids, int num_ids) { - struct iommu_fwspec *fwspec = dev->iommu_fwspec; + struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev); size_t size; int i;
@@ -2351,11 +2351,11 @@ int iommu_fwspec_add_ids(struct device *dev, u32 *ids, int num_ids)
size = offsetof(struct iommu_fwspec, ids[fwspec->num_ids + num_ids]); if (size > sizeof(*fwspec)) { - fwspec = krealloc(dev->iommu_fwspec, size, GFP_KERNEL); + fwspec = krealloc(fwspec, size, GFP_KERNEL); if (!fwspec) return -ENOMEM;
- dev->iommu_fwspec = fwspec; + dev_iommu_fwspec_set(dev, fwspec); }
for (i = 0; i < num_ids; i++) diff --git a/include/linux/iommu.h b/include/linux/iommu.h index c5cc3c9e393c..916d95fd0344 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -671,6 +671,17 @@ void iommu_fwspec_free(struct device *dev); int iommu_fwspec_add_ids(struct device *dev, u32 *ids, int num_ids); const struct iommu_ops *iommu_ops_from_fwnode(struct fwnode_handle *fwnode);
+static inline struct iommu_fwspec *dev_iommu_fwspec_get(struct device *dev) +{ + return dev->iommu_fwspec; +} + +static inline void dev_iommu_fwspec_set(struct device *dev, + struct iommu_fwspec *fwspec) +{ + dev->iommu_fwspec = fwspec; +} + extern int iommu_sva_bind_device(struct device *dev, struct mm_struct *mm, int *pasid, unsigned long flags, void *drvdata); extern int iommu_sva_unbind_device(struct device *dev, int pasid);
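With these helpers in place, callers read and update the fwspec through the accessors instead of dereferencing dev->iommu_fwspec directly; the follow-up patches convert each user along these lines:

	struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
	/* ... use or reallocate fwspec ... */
	dev_iommu_fwspec_set(dev, fwspec);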
From: Joerg Roedel jroedel@suse.de
mainline inclusion from mainline-v4.20-rc1 commit 8097e53eaba2bbae99dee661778ca20490b73951 category:feature bugzilla:NA CVE:NA
-------------------
Use the new helpers dev_iommu_fwspec_get()/set() to access the dev->iommu_fwspec pointer. This makes it easier to move that pointer later into another struct.
Acked-by: Hanjun Guo hanjun.guo@linaro.org Signed-off-by: Joerg Roedel jroedel@suse.de Signed-off-by: Chen Jun chenjun102@huawei.com Reviewed-by: Hanjun Guo guohanjun@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- drivers/acpi/arm64/iort.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-)
diff --git a/drivers/acpi/arm64/iort.c b/drivers/acpi/arm64/iort.c index 7408ffab7303..0a9290360f5d 100644 --- a/drivers/acpi/arm64/iort.c +++ b/drivers/acpi/arm64/iort.c @@ -783,7 +783,7 @@ static inline bool iort_iommu_driver_enabled(u8 type) static struct acpi_iort_node *iort_get_msi_resv_iommu(struct device *dev) { struct acpi_iort_node *iommu; - struct iommu_fwspec *fwspec = dev->iommu_fwspec; + struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
iommu = iort_get_iort_node(fwspec->iommu_fwnode);
@@ -798,9 +798,10 @@ static struct acpi_iort_node *iort_get_msi_resv_iommu(struct device *dev) return NULL; }
-static inline const struct iommu_ops *iort_fwspec_iommu_ops( - struct iommu_fwspec *fwspec) +static inline const struct iommu_ops *iort_fwspec_iommu_ops(struct device *dev) { + struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev); + return (fwspec && fwspec->ops) ? fwspec->ops : NULL; }
@@ -828,6 +829,7 @@ static inline int iort_add_device_replay(const struct iommu_ops *ops, */ int iort_iommu_msi_get_resv_regions(struct device *dev, struct list_head *head) { + struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev); struct acpi_iort_its_group *its; struct acpi_iort_node *iommu_node, *its_node = NULL; int i, resv = 0; @@ -845,9 +847,9 @@ int iort_iommu_msi_get_resv_regions(struct device *dev, struct list_head *head) * a given PCI or named component may map IDs to. */
- for (i = 0; i < dev->iommu_fwspec->num_ids; i++) { + for (i = 0; i < fwspec->num_ids; i++) { its_node = iort_node_map_id(iommu_node, - dev->iommu_fwspec->ids[i], + fwspec->ids[i], NULL, IORT_MSI_TYPE); if (its_node) break; @@ -878,8 +880,7 @@ int iort_iommu_msi_get_resv_regions(struct device *dev, struct list_head *head) return (resv == its->its_count) ? resv : -ENODEV; } #else -static inline const struct iommu_ops *iort_fwspec_iommu_ops( - struct iommu_fwspec *fwspec) +static inline const struct iommu_ops *iort_fwspec_iommu_ops(struct device *dev); { return NULL; } static inline int iort_add_device_replay(const struct iommu_ops *ops, struct device *dev) @@ -1060,7 +1061,7 @@ const struct iommu_ops *iort_iommu_configure(struct device *dev) * If we already translated the fwspec there * is nothing left to do, return the iommu_ops. */ - ops = iort_fwspec_iommu_ops(dev->iommu_fwspec); + ops = iort_fwspec_iommu_ops(dev); if (ops) return ops;
@@ -1118,7 +1119,7 @@ const struct iommu_ops *iort_iommu_configure(struct device *dev) * add_device callback for dev, replay it to get things in order. */ if (!err) { - ops = iort_fwspec_iommu_ops(dev->iommu_fwspec); + ops = iort_fwspec_iommu_ops(dev); err = iort_add_device_replay(ops, dev); }
From: Qian Cai cai@lca.pw
mainline inclusion from mainline-v5.0-rc1 commit 6b68835b5af4feb2a3f31592a52a1a68d7c1b1f3 category:bugfix bugzilla:NA CVE:NA
-------------------
Commit 8097e53eaba2 ("ACPI/IORT: Use helper functions to access dev->iommu_fwspec") mistakenly changed the iort_fwspec_iommu_ops() stub definition (compiled in when CONFIG_IOMMU_API=n), which caused the following compilation failure:
drivers/acpi/arm64/iort.c:880:1: error: expected identifier or '(' before '{' token
 { return NULL; }
 ^
drivers/acpi/arm64/iort.c:879:39: warning: 'iort_fwspec_iommu_ops' used but never defined
 static inline const struct iommu_ops *iort_fwspec_iommu_ops(struct device *dev);
                                       ^~~~~~~~~~~~~~~~~~~~~
Fix it.
Fixes: 8097e53eaba2 ("ACPI/IORT: Use helper functions to access dev->iommu_fwspec") Signed-off-by: Qian Cai cai@lca.pw [lorenzo.pieralisi@arm.com: updated tags and log] Signed-off-by: Lorenzo Pieralisi lorenzo.pieralisi@arm.com Acked-by: Hanjun Guo hanjun.guo@linaro.org Cc: Will Deacon will.deacon@arm.com Cc: Sudeep Holla sudeep.holla@arm.com Cc: Catalin Marinas catalin.marinas@arm.com Cc: Joerg Roedel jroedel@suse.de Signed-off-by: Will Deacon will.deacon@arm.com Signed-off-by: Chen Jun chenjun102@huawei.com Reviewed-by: Hanjun Guo guohanjun@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- drivers/acpi/arm64/iort.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/drivers/acpi/arm64/iort.c b/drivers/acpi/arm64/iort.c index 0a9290360f5d..9c03c8a49c23 100644 --- a/drivers/acpi/arm64/iort.c +++ b/drivers/acpi/arm64/iort.c @@ -880,7 +880,7 @@ int iort_iommu_msi_get_resv_regions(struct device *dev, struct list_head *head) return (resv == its->its_count) ? resv : -ENODEV; } #else -static inline const struct iommu_ops *iort_fwspec_iommu_ops(struct device *dev); +static inline const struct iommu_ops *iort_fwspec_iommu_ops(struct device *dev) { return NULL; } static inline int iort_add_device_replay(const struct iommu_ops *ops, struct device *dev)
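The root cause is a stray semicolon that turns the stub into a declaration followed by a bare brace block. A minimal standalone reproduction (hypothetical function name, not from the kernel tree):

	/* Broken: this declares foo(), and '{ return 0; }' is then a stray block. */
	static inline int foo(void);
	{ return 0; }

	/* Fixed: dropping the semicolon makes the braces the function body. */
	static inline int foo(void)
	{ return 0; }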
From: Joerg Roedel jroedel@suse.de
mainline inclusion from mainline-v4.20-rc1 commit 9b468f7d9cf1f089b7287865776eb100504681b7 category:feature bugzilla:NA CVE:NA
-------------------
Use the new helpers dev_iommu_fwspec_get()/set() to access the dev->iommu_fwspec pointer. This makes it easier to move that pointer later into another struct.
Cc: Will Deacon will.deacon@arm.com Cc: Robin Murphy robin.murphy@arm.com Acked-by: Will Deacon will.deacon@arm.com Signed-off-by: Joerg Roedel jroedel@suse.de
Conflicts: drivers/iommu/arm-smmu-v3.c
Signed-off-by: Chen Jun chenjun102@huawei.com Reviewed-by: Hanjun Guo guohanjun@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- drivers/iommu/arm-smmu-v3.c | 16 +++++++++------- drivers/iommu/arm-smmu.c | 12 ++++++------ 2 files changed, 15 insertions(+), 13 deletions(-)
diff --git a/drivers/iommu/arm-smmu-v3.c b/drivers/iommu/arm-smmu-v3.c index e232c8f5e447..9db068a947d3 100644 --- a/drivers/iommu/arm-smmu-v3.c +++ b/drivers/iommu/arm-smmu-v3.c @@ -2114,7 +2114,8 @@ static void arm_smmu_install_ste_for_dev(struct iommu_fwspec *fwspec) static void arm_smmu_detach_dev(struct device *dev) { unsigned long flags; - struct arm_smmu_master_data *master = dev->iommu_fwspec->iommu_priv; + struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev); + struct arm_smmu_master_data *master = fwspec->iommu_priv; struct arm_smmu_domain *smmu_domain = master->domain;
if (smmu_domain) { @@ -2128,22 +2129,23 @@ static void arm_smmu_detach_dev(struct device *dev) }
master->ste.assigned = false; - arm_smmu_install_ste_for_dev(dev->iommu_fwspec); + arm_smmu_install_ste_for_dev(fwspec); }
static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev) { int ret = 0; unsigned long flags; + struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev); struct arm_smmu_device *smmu; struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain); struct arm_smmu_master_data *master; struct arm_smmu_strtab_ent *ste;
- if (!dev->iommu_fwspec) + if (!fwspec) return -ENOENT;
- master = dev->iommu_fwspec->iommu_priv; + master = fwspec->iommu_priv; smmu = master->smmu; ste = &master->ste;
@@ -2187,7 +2189,7 @@ static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev) ste->s2_cfg = &smmu_domain->s2_cfg; }
- arm_smmu_install_ste_for_dev(dev->iommu_fwspec); + arm_smmu_install_ste_for_dev(fwspec); out_unlock: mutex_unlock(&smmu_domain->init_mutex); return ret; @@ -2458,7 +2460,7 @@ static int arm_smmu_add_device(struct device *dev) int i, ret; struct arm_smmu_device *smmu; struct arm_smmu_master_data *master; - struct iommu_fwspec *fwspec = dev->iommu_fwspec; + struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev); struct iommu_group *group;
if (!fwspec || fwspec->ops != &arm_smmu_ops) @@ -2536,7 +2538,7 @@ static int arm_smmu_add_device(struct device *dev)
static void arm_smmu_remove_device(struct device *dev) { - struct iommu_fwspec *fwspec = dev->iommu_fwspec; + struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev); struct arm_smmu_master_data *master; struct arm_smmu_device *smmu;
diff --git a/drivers/iommu/arm-smmu.c b/drivers/iommu/arm-smmu.c index 5bca4dc638c5..6b8b5f69d760 100644 --- a/drivers/iommu/arm-smmu.c +++ b/drivers/iommu/arm-smmu.c @@ -1112,7 +1112,7 @@ static bool arm_smmu_free_sme(struct arm_smmu_device *smmu, int idx)
static int arm_smmu_master_alloc_smes(struct device *dev) { - struct iommu_fwspec *fwspec = dev->iommu_fwspec; + struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev); struct arm_smmu_master_cfg *cfg = fwspec->iommu_priv; struct arm_smmu_device *smmu = cfg->smmu; struct arm_smmu_smr *smrs = smmu->smrs; @@ -1215,7 +1215,7 @@ static int arm_smmu_domain_add_master(struct arm_smmu_domain *smmu_domain, static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev) { int ret; - struct iommu_fwspec *fwspec = dev->iommu_fwspec; + struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev); struct arm_smmu_device *smmu; struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain);
@@ -1389,7 +1389,7 @@ static int arm_smmu_add_device(struct device *dev) { struct arm_smmu_device *smmu; struct arm_smmu_master_cfg *cfg; - struct iommu_fwspec *fwspec = dev->iommu_fwspec; + struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev); int i, ret;
if (using_legacy_binding) { @@ -1400,7 +1400,7 @@ static int arm_smmu_add_device(struct device *dev) * will allocate/initialise a new one. Thus we need to update fwspec for * later use. */ - fwspec = dev->iommu_fwspec; + fwspec = dev_iommu_fwspec_get(dev); if (ret) goto out_free; } else if (fwspec && fwspec->ops == &arm_smmu_ops) { @@ -1454,7 +1454,7 @@ static int arm_smmu_add_device(struct device *dev)
static void arm_smmu_remove_device(struct device *dev) { - struct iommu_fwspec *fwspec = dev->iommu_fwspec; + struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev); struct arm_smmu_master_cfg *cfg; struct arm_smmu_device *smmu;
@@ -1474,7 +1474,7 @@ static void arm_smmu_remove_device(struct device *dev)
static struct iommu_group *arm_smmu_device_group(struct device *dev) { - struct iommu_fwspec *fwspec = dev->iommu_fwspec; + struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev); struct arm_smmu_device *smmu = fwspec_smmu(fwspec); struct iommu_group *group = NULL; int i, idx;
From: Joerg Roedel jroedel@suse.de
mainline inclusion from mainline-v4.20-rc1 commit 98cc4f7196422ee20e00e2067e484e3136dbe4a5 category:feature bugzilla:NA CVE:NA
-------------------
Use the new helpers dev_iommu_fwspec_get()/set() to access the dev->iommu_fwspec pointer. This makes it easier to move that pointer later into another struct.
Signed-off-by: Joerg Roedel jroedel@suse.de Signed-off-by: Chen Jun chenjun102@huawei.com Reviewed-by: Hanjun Guo guohanjun@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- drivers/iommu/dma-iommu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c index b68d9fd27bba..229e5d86689f 100644 --- a/drivers/iommu/dma-iommu.c +++ b/drivers/iommu/dma-iommu.c @@ -177,7 +177,7 @@ EXPORT_SYMBOL(iommu_put_dma_cookie); void iommu_dma_get_resv_regions(struct device *dev, struct list_head *list) {
- if (!is_of_node(dev->iommu_fwspec->iommu_fwnode)) + if (!is_of_node(dev_iommu_fwspec_get(dev)->iommu_fwnode)) iort_iommu_msi_get_resv_regions(dev, list);
}
From: Joerg Roedel jroedel@suse.de
mainline inclusion from mainline-v4.20-rc1 commit df90365580623b5c53cd9a46c14375191123d776 category:feature bugzilla:NA CVE:NA
-------------------
Use the new helpers dev_iommu_fwspec_get()/set() to access the dev->iommu_fwspec pointer. This makes it easier to move that pointer later into another struct.
Signed-off-by: Joerg Roedel jroedel@suse.de Signed-off-by: Chen Jun chenjun102@huawei.com Reviewed-by: Hanjun Guo guohanjun@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- drivers/iommu/ipmmu-vmsa.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-)
diff --git a/drivers/iommu/ipmmu-vmsa.c b/drivers/iommu/ipmmu-vmsa.c index d8598e44e381..30bc25b711d3 100644 --- a/drivers/iommu/ipmmu-vmsa.c +++ b/drivers/iommu/ipmmu-vmsa.c @@ -84,7 +84,9 @@ static struct ipmmu_vmsa_domain *to_vmsa_domain(struct iommu_domain *dom)
static struct ipmmu_vmsa_device *to_ipmmu(struct device *dev) { - return dev->iommu_fwspec ? dev->iommu_fwspec->iommu_priv : NULL; + struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev); + + return fwspec ? fwspec->iommu_priv : NULL; }
#define TLB_LOOP_TIMEOUT 100 /* 100us */ @@ -646,7 +648,7 @@ static void ipmmu_domain_free(struct iommu_domain *io_domain) static int ipmmu_attach_device(struct iommu_domain *io_domain, struct device *dev) { - struct iommu_fwspec *fwspec = dev->iommu_fwspec; + struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev); struct ipmmu_vmsa_device *mmu = to_ipmmu(dev); struct ipmmu_vmsa_domain *domain = to_vmsa_domain(io_domain); unsigned int i; @@ -695,7 +697,7 @@ static int ipmmu_attach_device(struct iommu_domain *io_domain, static void ipmmu_detach_device(struct iommu_domain *io_domain, struct device *dev) { - struct iommu_fwspec *fwspec = dev->iommu_fwspec; + struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev); struct ipmmu_vmsa_domain *domain = to_vmsa_domain(io_domain); unsigned int i;
@@ -747,13 +749,15 @@ static phys_addr_t ipmmu_iova_to_phys(struct iommu_domain *io_domain, static int ipmmu_init_platform_device(struct device *dev, struct of_phandle_args *args) { + struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev); struct platform_device *ipmmu_pdev;
ipmmu_pdev = of_find_device_by_node(args->np); if (!ipmmu_pdev) return -ENODEV;
- dev->iommu_fwspec->iommu_priv = platform_get_drvdata(ipmmu_pdev); + fwspec->iommu_priv = platform_get_drvdata(ipmmu_pdev); + return 0; }
From: Joerg Roedel jroedel@suse.de
mainline inclusion from mainline-v4.20-rc1 commit a9bf2eec5a6fc01a0a5250eaf0bf61dfd382a78a category:feature bugzilla:NA CVE:NA
-------------------
Use the new helpers dev_iommu_fwspec_get()/set() to access the dev->iommu_fwspec pointer. This makes it easier to move that pointer later into another struct.
Cc: Matthias Brugger matthias.bgg@gmail.com Tested-by: Yong Wu yong.wu@mediatek.com Signed-off-by: Joerg Roedel jroedel@suse.de Signed-off-by: Chen Jun chenjun102@huawei.com Reviewed-by: Hanjun Guo guohanjun@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- drivers/iommu/mtk_iommu.c | 21 ++++++++++++--------- drivers/iommu/mtk_iommu_v1.c | 28 ++++++++++++++++------------ 2 files changed, 28 insertions(+), 21 deletions(-)
diff --git a/drivers/iommu/mtk_iommu.c b/drivers/iommu/mtk_iommu.c index 8e75f34ac886..5ed3c8ec0b64 100644 --- a/drivers/iommu/mtk_iommu.c +++ b/drivers/iommu/mtk_iommu.c @@ -268,7 +268,7 @@ static void mtk_iommu_config(struct mtk_iommu_data *data, { struct mtk_smi_larb_iommu *larb_mmu; unsigned int larbid, portid; - struct iommu_fwspec *fwspec = dev->iommu_fwspec; + struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev); int i;
for (i = 0; i < fwspec->num_ids; ++i) { @@ -360,7 +360,7 @@ static int mtk_iommu_attach_device(struct iommu_domain *domain, struct device *dev) { struct mtk_iommu_domain *dom = to_mtk_domain(domain); - struct mtk_iommu_data *data = dev->iommu_fwspec->iommu_priv; + struct mtk_iommu_data *data = dev_iommu_fwspec_get(dev)->iommu_priv;
if (!data) return -ENODEV; @@ -379,7 +379,7 @@ static int mtk_iommu_attach_device(struct iommu_domain *domain, static void mtk_iommu_detach_device(struct iommu_domain *domain, struct device *dev) { - struct mtk_iommu_data *data = dev->iommu_fwspec->iommu_priv; + struct mtk_iommu_data *data = dev_iommu_fwspec_get(dev)->iommu_priv;
if (!data) return; @@ -441,13 +441,14 @@ static phys_addr_t mtk_iommu_iova_to_phys(struct iommu_domain *domain,
static int mtk_iommu_add_device(struct device *dev) { + struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev); struct mtk_iommu_data *data; struct iommu_group *group;
- if (!dev->iommu_fwspec || dev->iommu_fwspec->ops != &mtk_iommu_ops) + if (!fwspec || fwspec->ops != &mtk_iommu_ops) return -ENODEV; /* Not a iommu client device */
- data = dev->iommu_fwspec->iommu_priv; + data = fwspec->iommu_priv; iommu_device_link(&data->iommu, dev);
group = iommu_group_get_for_dev(dev); @@ -460,12 +461,13 @@ static int mtk_iommu_add_device(struct device *dev)
static void mtk_iommu_remove_device(struct device *dev) { + struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev); struct mtk_iommu_data *data;
- if (!dev->iommu_fwspec || dev->iommu_fwspec->ops != &mtk_iommu_ops) + if (!fwspec || fwspec->ops != &mtk_iommu_ops) return;
- data = dev->iommu_fwspec->iommu_priv; + data = fwspec->iommu_priv; iommu_device_unlink(&data->iommu, dev);
iommu_group_remove_device(dev); @@ -492,6 +494,7 @@ static struct iommu_group *mtk_iommu_device_group(struct device *dev)
static int mtk_iommu_of_xlate(struct device *dev, struct of_phandle_args *args) { + struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev); struct platform_device *m4updev;
if (args->args_count != 1) { @@ -500,13 +503,13 @@ static int mtk_iommu_of_xlate(struct device *dev, struct of_phandle_args *args) return -EINVAL; }
- if (!dev->iommu_fwspec->iommu_priv) { + if (!fwspec->iommu_priv) { /* Get the m4u device */ m4updev = of_find_device_by_node(args->np); if (WARN_ON(!m4updev)) return -EINVAL;
- dev->iommu_fwspec->iommu_priv = platform_get_drvdata(m4updev); + fwspec->iommu_priv = platform_get_drvdata(m4updev); }
return iommu_fwspec_add_ids(dev, args->args, 1); diff --git a/drivers/iommu/mtk_iommu_v1.c b/drivers/iommu/mtk_iommu_v1.c index 676c029494e4..96d30111c34e 100644 --- a/drivers/iommu/mtk_iommu_v1.c +++ b/drivers/iommu/mtk_iommu_v1.c @@ -206,7 +206,7 @@ static void mtk_iommu_config(struct mtk_iommu_data *data, { struct mtk_smi_larb_iommu *larb_mmu; unsigned int larbid, portid; - struct iommu_fwspec *fwspec = dev->iommu_fwspec; + struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev); int i;
for (i = 0; i < fwspec->num_ids; ++i) { @@ -271,7 +271,7 @@ static int mtk_iommu_attach_device(struct iommu_domain *domain, struct device *dev) { struct mtk_iommu_domain *dom = to_mtk_domain(domain); - struct mtk_iommu_data *data = dev->iommu_fwspec->iommu_priv; + struct mtk_iommu_data *data = dev_iommu_fwspec_get(dev)->iommu_priv; int ret;
if (!data) @@ -293,7 +293,7 @@ static int mtk_iommu_attach_device(struct iommu_domain *domain, static void mtk_iommu_detach_device(struct iommu_domain *domain, struct device *dev) { - struct mtk_iommu_data *data = dev->iommu_fwspec->iommu_priv; + struct mtk_iommu_data *data = dev_iommu_fwspec_get(dev)->iommu_priv;
if (!data) return; @@ -371,6 +371,7 @@ static struct iommu_ops mtk_iommu_ops; static int mtk_iommu_create_mapping(struct device *dev, struct of_phandle_args *args) { + struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev); struct mtk_iommu_data *data; struct platform_device *m4updev; struct dma_iommu_mapping *mtk_mapping; @@ -383,28 +384,29 @@ static int mtk_iommu_create_mapping(struct device *dev, return -EINVAL; }
- if (!dev->iommu_fwspec) { + if (!fwspec) { ret = iommu_fwspec_init(dev, &args->np->fwnode, &mtk_iommu_ops); if (ret) return ret; - } else if (dev->iommu_fwspec->ops != &mtk_iommu_ops) { + fwspec = dev_iommu_fwspec_get(dev); + } else if (dev_iommu_fwspec_get(dev)->ops != &mtk_iommu_ops) { return -EINVAL; }
- if (!dev->iommu_fwspec->iommu_priv) { + if (!fwspec->iommu_priv) { /* Get the m4u device */ m4updev = of_find_device_by_node(args->np); if (WARN_ON(!m4updev)) return -EINVAL;
- dev->iommu_fwspec->iommu_priv = platform_get_drvdata(m4updev); + fwspec->iommu_priv = platform_get_drvdata(m4updev); }
ret = iommu_fwspec_add_ids(dev, args->args, 1); if (ret) return ret;
- data = dev->iommu_fwspec->iommu_priv; + data = fwspec->iommu_priv; m4udev = data->dev; mtk_mapping = m4udev->archdata.iommu; if (!mtk_mapping) { @@ -422,6 +424,7 @@ static int mtk_iommu_create_mapping(struct device *dev,
static int mtk_iommu_add_device(struct device *dev) { + struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev); struct dma_iommu_mapping *mtk_mapping; struct of_phandle_args iommu_spec; struct of_phandle_iterator it; @@ -440,7 +443,7 @@ static int mtk_iommu_add_device(struct device *dev) of_node_put(iommu_spec.np); }
- if (!dev->iommu_fwspec || dev->iommu_fwspec->ops != &mtk_iommu_ops) + if (!fwspec || fwspec->ops != &mtk_iommu_ops) return -ENODEV; /* Not a iommu client device */
/* @@ -458,7 +461,7 @@ static int mtk_iommu_add_device(struct device *dev) if (err) return err;
- data = dev->iommu_fwspec->iommu_priv; + data = fwspec->iommu_priv; mtk_mapping = data->dev->archdata.iommu; err = arm_iommu_attach_device(dev, mtk_mapping); if (err) { @@ -471,12 +474,13 @@ static int mtk_iommu_add_device(struct device *dev)
static void mtk_iommu_remove_device(struct device *dev) { + struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev); struct mtk_iommu_data *data;
- if (!dev->iommu_fwspec || dev->iommu_fwspec->ops != &mtk_iommu_ops) + if (!fwspec || fwspec->ops != &mtk_iommu_ops) return;
- data = dev->iommu_fwspec->iommu_priv; + data = fwspec->iommu_priv; iommu_device_unlink(&data->iommu, dev);
iommu_group_remove_device(dev);
From: Joerg Roedel jroedel@suse.de
mainline inclusion from mainline-v4.20-rc1 commit 5c7e6bd71bfd96a65adb62593ea1b7d51a445b26 category:feature bugzilla:NA CVE:NA
-------------------
Use the new helpers dev_iommu_fwspec_get()/set() to access the dev->iommu_fwspec pointer. This makes it easier to move that pointer later into another struct.
Signed-off-by: Joerg Roedel jroedel@suse.de Signed-off-by: Chen Jun chenjun102@huawei.com Reviewed-by: Hanjun Guo guohanjun@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- drivers/iommu/of_iommu.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-)
diff --git a/drivers/iommu/of_iommu.c b/drivers/iommu/of_iommu.c index 2a34fa23978b..82b46d0de64e 100644 --- a/drivers/iommu/of_iommu.c +++ b/drivers/iommu/of_iommu.c @@ -147,7 +147,7 @@ const struct iommu_ops *of_iommu_configure(struct device *dev, struct device_node *master_np) { const struct iommu_ops *ops = NULL; - struct iommu_fwspec *fwspec = dev->iommu_fwspec; + struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev); int err = NO_IOMMU;
if (!master_np) @@ -202,14 +202,18 @@ const struct iommu_ops *of_iommu_configure(struct device *dev, } }
+ /* * Two success conditions can be represented by non-negative err here: * >0 : there is no IOMMU, or one was unavailable for non-fatal reasons * 0 : we found an IOMMU, and dev->fwspec is initialised appropriately * <0 : any actual error */ - if (!err) - ops = dev->iommu_fwspec->ops; + if (!err) { + /* The fwspec pointer changed, read it again */ + fwspec = dev_iommu_fwspec_get(dev); + ops = fwspec->ops; + } /* * If we have reason to believe the IOMMU driver missed the initial * add_device callback for dev, replay it to get things in order.
From: Joerg Roedel jroedel@suse.de
mainline inclusion from mainline-v4.20-rc1 commit 2000e5f70322dcd612a9ccddfb56bf6716e0da9a category:feature bugzilla:NA CVE:NA
-------------------
Use the new helpers dev_iommu_fwspec_get()/set() to access the dev->iommu_fwspec pointer. This makes it easier to move that pointer later into another struct.
Cc: Rob Clark robdclark@gmail.com Signed-off-by: Joerg Roedel jroedel@suse.de Signed-off-by: Chen Jun chenjun102@huawei.com Reviewed-by: Hanjun Guo guohanjun@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- drivers/iommu/qcom_iommu.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-)
diff --git a/drivers/iommu/qcom_iommu.c b/drivers/iommu/qcom_iommu.c index b0a4a3d2f60e..03e08346b342 100644 --- a/drivers/iommu/qcom_iommu.c +++ b/drivers/iommu/qcom_iommu.c @@ -352,7 +352,8 @@ static void qcom_iommu_domain_free(struct iommu_domain *domain)
static int qcom_iommu_attach_dev(struct iommu_domain *domain, struct device *dev) { - struct qcom_iommu_dev *qcom_iommu = to_iommu(dev->iommu_fwspec); + struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev); + struct qcom_iommu_dev *qcom_iommu = to_iommu(fwspec); struct qcom_iommu_domain *qcom_domain = to_qcom_iommu_domain(domain); int ret;
@@ -363,7 +364,7 @@ static int qcom_iommu_attach_dev(struct iommu_domain *domain, struct device *dev
/* Ensure that the domain is finalized */ pm_runtime_get_sync(qcom_iommu->dev); - ret = qcom_iommu_init_domain(domain, qcom_iommu, dev->iommu_fwspec); + ret = qcom_iommu_init_domain(domain, qcom_iommu, fwspec); pm_runtime_put_sync(qcom_iommu->dev); if (ret < 0) return ret; @@ -385,7 +386,7 @@ static int qcom_iommu_attach_dev(struct iommu_domain *domain, struct device *dev
static void qcom_iommu_detach_dev(struct iommu_domain *domain, struct device *dev) { - struct iommu_fwspec *fwspec = dev->iommu_fwspec; + struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev); struct qcom_iommu_dev *qcom_iommu = to_iommu(fwspec); struct qcom_iommu_domain *qcom_domain = to_qcom_iommu_domain(domain); unsigned i; @@ -496,7 +497,7 @@ static bool qcom_iommu_capable(enum iommu_cap cap)
static int qcom_iommu_add_device(struct device *dev) { - struct qcom_iommu_dev *qcom_iommu = to_iommu(dev->iommu_fwspec); + struct qcom_iommu_dev *qcom_iommu = to_iommu(dev_iommu_fwspec_get(dev)); struct iommu_group *group; struct device_link *link;
@@ -527,7 +528,7 @@ static int qcom_iommu_add_device(struct device *dev)
static void qcom_iommu_remove_device(struct device *dev) { - struct qcom_iommu_dev *qcom_iommu = to_iommu(dev->iommu_fwspec); + struct qcom_iommu_dev *qcom_iommu = to_iommu(dev_iommu_fwspec_get(dev));
if (!qcom_iommu) return; @@ -539,6 +540,7 @@ static void qcom_iommu_remove_device(struct device *dev)
static int qcom_iommu_of_xlate(struct device *dev, struct of_phandle_args *args) { + struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev); struct qcom_iommu_dev *qcom_iommu; struct platform_device *iommu_pdev; unsigned asid = args->args[0]; @@ -564,14 +566,14 @@ static int qcom_iommu_of_xlate(struct device *dev, struct of_phandle_args *args) WARN_ON(asid > qcom_iommu->num_ctxs)) return -EINVAL;
- if (!dev->iommu_fwspec->iommu_priv) { - dev->iommu_fwspec->iommu_priv = qcom_iommu; + if (!fwspec->iommu_priv) { + fwspec->iommu_priv = qcom_iommu; } else { /* make sure devices iommus dt node isn't referring to * multiple different iommu devices. Multiple context * banks are ok, but multiple devices are not: */ - if (WARN_ON(qcom_iommu != dev->iommu_fwspec->iommu_priv)) + if (WARN_ON(qcom_iommu != fwspec->iommu_priv)) return -EINVAL; }
From: Joerg Roedel jroedel@suse.de
mainline inclusion from mainline-v4.20-rc1 commit db5d6a70043a0073e59af99cd4fd6bcb5bd3ace2 category:feature bugzilla:NA CVE:NA
-------------------
Use the new helpers dev_iommu_fwspec_get()/set() to access the dev->iommu_fwspec pointer. This makes it easier to move that pointer later into another struct.
Cc: Thierry Reding thierry.reding@gmail.com Acked-by: Thierry Reding treding@nvidia.com Signed-off-by: Joerg Roedel jroedel@suse.de Signed-off-by: Chen Jun chenjun102@huawei.com Reviewed-by: Hanjun Guo guohanjun@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- drivers/iommu/tegra-smmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/drivers/iommu/tegra-smmu.c b/drivers/iommu/tegra-smmu.c index fa0ecb5e6380..b784d51f2a0d 100644 --- a/drivers/iommu/tegra-smmu.c +++ b/drivers/iommu/tegra-smmu.c @@ -858,7 +858,7 @@ static struct iommu_group *tegra_smmu_group_get(struct tegra_smmu *smmu,
static struct iommu_group *tegra_smmu_device_group(struct device *dev) { - struct iommu_fwspec *fwspec = dev->iommu_fwspec; + struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev); struct tegra_smmu *smmu = dev->archdata.iommu; struct iommu_group *group;
From: Joerg Roedel jroedel@suse.de
mainline inclusion from mainline-v5.0-rc1 commit da5d2748e4a4512237764d2a53bdf686eccee18b category:bugfix bugzilla:NA CVE:NA
-------------------
The mtk_iommu_add_device() function keeps the fwspec in an on-stack pointer and calls mtk_iommu_create_mapping(), which might change its source, dev->iommu_fwspec. This causes the on-stack pointer to be obsoleted and the device initialization to fail. Update the on-stack fwspec pointer after mtk_iommu_create_mapping() has been called.
Reported-by: Frank Wunderlich frank-w@public-files.de Fixes: a9bf2eec5a6f ('iommu/mediatek: Use helper functions to access dev->iommu_fwspec') Tested-by: Frank Wunderlich frank-w@public-files.de Signed-off-by: Joerg Roedel jroedel@suse.de Signed-off-by: Chen Jun chenjun102@huawei.com Reviewed-by: Hanjun Guo guohanjun@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- drivers/iommu/mtk_iommu_v1.c | 4 ++++ 1 file changed, 4 insertions(+)
diff --git a/drivers/iommu/mtk_iommu_v1.c b/drivers/iommu/mtk_iommu_v1.c index 96d30111c34e..3ddac0078a2b 100644 --- a/drivers/iommu/mtk_iommu_v1.c +++ b/drivers/iommu/mtk_iommu_v1.c @@ -440,6 +440,10 @@ static int mtk_iommu_add_device(struct device *dev) iommu_spec.args_count = count;
mtk_iommu_create_mapping(dev, &iommu_spec); + + /* dev->iommu_fwspec might have changed */ + fwspec = dev_iommu_fwspec_get(dev); + of_node_put(iommu_spec.np); }
From: Ming Lei ming.lei@redhat.com
mainline inclusion from mainline-5.4-rc1 commit 58c898ba370e68d39470cd0d932b524682c1f9be category: bugfix bugzilla: 21614 CVE: NA
---------------------------
There are 4 users which check if queue is registered, so add one helper to check it.
Cc: Christoph Hellwig hch@infradead.org Cc: Hannes Reinecke hare@suse.com Cc: Greg KH gregkh@linuxfoundation.org Cc: Mike Snitzer snitzer@redhat.com Cc: Bart Van Assche bvanassche@acm.org Reviewed-by: Bart Van Assche bvanassche@acm.org Signed-off-by: Ming Lei ming.lei@redhat.com Signed-off-by: Jens Axboe axboe@kernel.dk Reviewed-by: Yufen Yu yuyufen@huawei.com
Conflicts: block/blk-sysfs.c block/blk-wbt.c Signed-off-by: Yu Kuai yukuai3@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- block/blk-sysfs.c | 4 ++-- block/blk-wbt.c | 2 +- block/elevator.c | 2 +- include/linux/blkdev.h | 1 + 4 files changed, 5 insertions(+), 4 deletions(-)
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 2d905a8b1473..ec20dc5e5e0f 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -897,7 +897,7 @@ int blk_register_queue(struct gendisk *disk) if (WARN_ON(!q)) return -ENXIO;
- WARN_ONCE(test_bit(QUEUE_FLAG_REGISTERED, &q->queue_flags), + WARN_ONCE(blk_queue_registered(q), "%s is registering an already registered queue\n", kobject_name(&dev->kobj)); queue_flag_set_unlocked(QUEUE_FLAG_REGISTERED, q); @@ -974,7 +974,7 @@ void blk_unregister_queue(struct gendisk *disk) return;
/* Return early if disk->queue was never registered. */ - if (!test_bit(QUEUE_FLAG_REGISTERED, &q->queue_flags)) + if (!blk_queue_registered(q)) return;
/* diff --git a/block/blk-wbt.c b/block/blk-wbt.c index bb8f588789c8..2217ba732b7b 100644 --- a/block/blk-wbt.c +++ b/block/blk-wbt.c @@ -704,7 +704,7 @@ void wbt_enable_default(struct request_queue *q) return;
/* Queue not registered? Maybe shutting down... */ - if (!test_bit(QUEUE_FLAG_REGISTERED, &q->queue_flags)) + if (!blk_queue_registered(q)) return;
if ((q->mq_ops && IS_ENABLED(CONFIG_BLK_WBT_MQ)) || diff --git a/block/elevator.c b/block/elevator.c index fae58b2f906f..aa878a90259e 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -1087,7 +1087,7 @@ static int __elevator_change(struct request_queue *q, const char *name) struct elevator_type *e;
/* Make sure queue is not in the middle of being removed */ - if (!test_bit(QUEUE_FLAG_REGISTERED, &q->queue_flags)) + if (!blk_queue_registered(q)) return -ENOENT;
/* diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 704901e80ee3..ec86ef372e25 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -771,6 +771,7 @@ bool blk_queue_flag_test_and_clear(unsigned int flag, struct request_queue *q); #define blk_queue_quiesced(q) test_bit(QUEUE_FLAG_QUIESCED, &(q)->queue_flags) #define blk_queue_pm_only(q) atomic_read(&(q)->pm_only) #define blk_queue_fua(q) test_bit(QUEUE_FLAG_FUA, &(q)->queue_flags) +#define blk_queue_registered(q) test_bit(QUEUE_FLAG_REGISTERED, &(q)->queue_flags)
extern void blk_set_pm_only(struct request_queue *q); extern void blk_clear_pm_only(struct request_queue *q);
From: Ming Lei ming.lei@redhat.com
mainline inclusion from mainline-5.4-rc1 commit cecf5d87ff2035127bb5a9ee054d0023a4a7cad3 category: bugfix bugzilla: 21614 CVE: NA
---------------------------
The kernfs built-in lock of 'kn->count' is held in the sysfs .show/.store path. Meanwhile, inside the block layer's .show/.store callbacks, q->sysfs_lock is required.
However, when the mq & iosched kobjects are removed via blk_mq_unregister_dev() & elv_unregister_queue(), q->sysfs_lock is held too. This causes an AB-BA deadlock, because the kernfs built-in lock of 'kn->count' is also required inside kobject_del(); see the lockdep warning [1].
On the other hand, it isn't necessary to acquire q->sysfs_lock for either blk_mq_unregister_dev() or elv_unregister_queue(), because clearing the REGISTERED flag prevents stores to 'queue/scheduler' from happening. Also, sysfs writes (stores) are exclusive, so there is no need to hold the lock for elv_unregister_queue() when it is called from the elevator-switching path.
So split .sysfs_lock into two locks: one keeps the name .sysfs_lock and covers synchronizing .store, the other is named .sysfs_dir_lock and covers the kobjects and related status changes.
sysfs itself can handle the race between adding/removing kobjects and showing/storing attributes under those kobjects. For switching the scheduler via a store to 'queue/scheduler', the QUEUE_FLAG_REGISTERED queue flag together with .sysfs_lock is used to avoid the race, so .sysfs_lock no longer has to be held while removing/adding kobjects.
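As a rough sketch of the resulting scheme (simplified from the blk_register_queue() hunk below; error handling and elevator registration are omitted), the new lock only brackets the kobject directory work while the original lock keeps guarding queue state changes and uevents:

    mutex_lock(&q->sysfs_dir_lock);        /* kobject add/remove only */
    kobject_add(&q->kobj, kobject_get(&dev->kobj), "%s", "queue");
    if (q->mq_ops)
            __blk_mq_register_dev(dev, q);

    mutex_lock(&q->sysfs_lock);            /* queue state and uevents */
    blk_queue_flag_set(QUEUE_FLAG_REGISTERED, q);
    wbt_enable_default(q);
    blk_throtl_register_queue(q);
    kobject_uevent(&q->kobj, KOBJ_ADD);
    mutex_unlock(&q->sysfs_lock);

    mutex_unlock(&q->sysfs_dir_lock);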
[1] lockdep warning ====================================================== WARNING: possible circular locking dependency detected 5.3.0-rc3-00044-g73277fc75ea0 #1380 Not tainted ------------------------------------------------------ rmmod/777 is trying to acquire lock: 00000000ac50e981 (kn->count#202){++++}, at: kernfs_remove_by_name_ns+0x59/0x72
but task is already holding lock: 00000000fb16ae21 (&q->sysfs_lock){+.+.}, at: blk_unregister_queue+0x78/0x10b
which lock already depends on the new lock.
the existing dependency chain (in reverse order) is:
-> #1 (&q->sysfs_lock){+.+.}: __lock_acquire+0x95f/0xa2f lock_acquire+0x1b4/0x1e8 __mutex_lock+0x14a/0xa9b blk_mq_hw_sysfs_show+0x63/0xb6 sysfs_kf_seq_show+0x11f/0x196 seq_read+0x2cd/0x5f2 vfs_read+0xc7/0x18c ksys_read+0xc4/0x13e do_syscall_64+0xa7/0x295 entry_SYSCALL_64_after_hwframe+0x49/0xbe
-> #0 (kn->count#202){++++}: check_prev_add+0x5d2/0xc45 validate_chain+0xed3/0xf94 __lock_acquire+0x95f/0xa2f lock_acquire+0x1b4/0x1e8 __kernfs_remove+0x237/0x40b kernfs_remove_by_name_ns+0x59/0x72 remove_files+0x61/0x96 sysfs_remove_group+0x81/0xa4 sysfs_remove_groups+0x3b/0x44 kobject_del+0x44/0x94 blk_mq_unregister_dev+0x83/0xdd blk_unregister_queue+0xa0/0x10b del_gendisk+0x259/0x3fa null_del_dev+0x8b/0x1c3 [null_blk] null_exit+0x5c/0x95 [null_blk] __se_sys_delete_module+0x204/0x337 do_syscall_64+0xa7/0x295 entry_SYSCALL_64_after_hwframe+0x49/0xbe
other info that might help us debug this:
Possible unsafe locking scenario:
CPU0 CPU1 ---- ---- lock(&q->sysfs_lock); lock(kn->count#202); lock(&q->sysfs_lock); lock(kn->count#202);
*** DEADLOCK ***
2 locks held by rmmod/777: #0: 00000000e69bd9de (&lock){+.+.}, at: null_exit+0x2e/0x95 [null_blk] #1: 00000000fb16ae21 (&q->sysfs_lock){+.+.}, at: blk_unregister_queue+0x78/0x10b
stack backtrace: CPU: 0 PID: 777 Comm: rmmod Not tainted 5.3.0-rc3-00044-g73277fc75ea0 #1380 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS ?-20180724_192412-buildhw-07.phx4 Call Trace: dump_stack+0x9a/0xe6 check_noncircular+0x207/0x251 ? print_circular_bug+0x32a/0x32a ? find_usage_backwards+0x84/0xb0 check_prev_add+0x5d2/0xc45 validate_chain+0xed3/0xf94 ? check_prev_add+0xc45/0xc45 ? mark_lock+0x11b/0x804 ? check_usage_forwards+0x1ca/0x1ca __lock_acquire+0x95f/0xa2f lock_acquire+0x1b4/0x1e8 ? kernfs_remove_by_name_ns+0x59/0x72 __kernfs_remove+0x237/0x40b ? kernfs_remove_by_name_ns+0x59/0x72 ? kernfs_next_descendant_post+0x7d/0x7d ? strlen+0x10/0x23 ? strcmp+0x22/0x44 kernfs_remove_by_name_ns+0x59/0x72 remove_files+0x61/0x96 sysfs_remove_group+0x81/0xa4 sysfs_remove_groups+0x3b/0x44 kobject_del+0x44/0x94 blk_mq_unregister_dev+0x83/0xdd blk_unregister_queue+0xa0/0x10b del_gendisk+0x259/0x3fa ? disk_events_poll_msecs_store+0x12b/0x12b ? check_flags+0x1ea/0x204 ? mark_held_locks+0x1f/0x7a null_del_dev+0x8b/0x1c3 [null_blk] null_exit+0x5c/0x95 [null_blk] __se_sys_delete_module+0x204/0x337 ? free_module+0x39f/0x39f ? blkcg_maybe_throttle_current+0x8a/0x718 ? rwlock_bug+0x62/0x62 ? __blkcg_punt_bio_submit+0xd0/0xd0 ? trace_hardirqs_on_thunk+0x1a/0x20 ? mark_held_locks+0x1f/0x7a ? do_syscall_64+0x4c/0x295 do_syscall_64+0xa7/0x295 entry_SYSCALL_64_after_hwframe+0x49/0xbe RIP: 0033:0x7fb696cdbe6b Code: 73 01 c3 48 8b 0d 1d 20 0c 00 f7 d8 64 89 01 48 83 c8 ff c3 66 2e 0f 1f 84 00 00 008 RSP: 002b:00007ffec9588788 EFLAGS: 00000206 ORIG_RAX: 00000000000000b0 RAX: ffffffffffffffda RBX: 0000559e589137c0 RCX: 00007fb696cdbe6b RDX: 000000000000000a RSI: 0000000000000800 RDI: 0000559e58913828 RBP: 0000000000000000 R08: 00007ffec9587701 R09: 0000000000000000 R10: 00007fb696d4eae0 R11: 0000000000000206 R12: 00007ffec95889b0 R13: 00007ffec95896b3 R14: 0000559e58913260 R15: 0000559e589137c0
Cc: Christoph Hellwig hch@infradead.org Cc: Hannes Reinecke hare@suse.com Cc: Greg KH gregkh@linuxfoundation.org Cc: Mike Snitzer snitzer@redhat.com Reviewed-by: Bart Van Assche bvanassche@acm.org Signed-off-by: Ming Lei ming.lei@redhat.com Signed-off-by: Jens Axboe axboe@kernel.dk
Conflicts: block/blk.h block/elevator.c block/blk-sysfs.c block/blk-core.c Signed-off-by: Yu Kuai yukuai3@huawei.com Reviewed-by: Yufen Yu yuyufen@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- block/blk-core.c | 1 + block/blk-mq-sysfs.c | 12 +++++----- block/blk-sysfs.c | 37 +++++++++++++++++------------- block/blk.h | 2 +- block/elevator.c | 52 +++++++++++++++++++++++++++++++++++------- include/linux/blkdev.h | 1 + 6 files changed, 74 insertions(+), 31 deletions(-)
diff --git a/block/blk-core.c b/block/blk-core.c index ba3b7ac5b478..5510ceaea81a 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1049,6 +1049,7 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id, mutex_init(&q->blk_trace_mutex); #endif mutex_init(&q->sysfs_lock); + mutex_init(&q->sysfs_dir_lock); spin_lock_init(&q->__queue_lock);
q->queue_lock = lock ? : &q->__queue_lock; diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c index 014c7d84ff99..31f283f4b37f 100644 --- a/block/blk-mq-sysfs.c +++ b/block/blk-mq-sysfs.c @@ -278,7 +278,7 @@ void blk_mq_unregister_dev(struct device *dev, struct request_queue *q) struct blk_mq_hw_ctx *hctx; int i;
- lockdep_assert_held(&q->sysfs_lock); + lockdep_assert_held(&q->sysfs_dir_lock);
queue_for_each_hw_ctx(q, hctx, i) blk_mq_unregister_hctx(hctx); @@ -328,7 +328,7 @@ int __blk_mq_register_dev(struct device *dev, struct request_queue *q) int ret, i;
WARN_ON_ONCE(!q->kobj.parent); - lockdep_assert_held(&q->sysfs_lock); + lockdep_assert_held(&q->sysfs_dir_lock);
ret = kobject_add(q->mq_kobj, kobject_get(&dev->kobj), "%s", "mq"); if (ret < 0) @@ -374,7 +374,7 @@ void blk_mq_sysfs_unregister(struct request_queue *q) struct blk_mq_hw_ctx *hctx; int i;
- mutex_lock(&q->sysfs_lock); + mutex_lock(&q->sysfs_dir_lock); if (!q->mq_sysfs_init_done) goto unlock;
@@ -382,7 +382,7 @@ void blk_mq_sysfs_unregister(struct request_queue *q) blk_mq_unregister_hctx(hctx);
unlock: - mutex_unlock(&q->sysfs_lock); + mutex_unlock(&q->sysfs_dir_lock); }
int blk_mq_sysfs_register(struct request_queue *q) @@ -390,7 +390,7 @@ int blk_mq_sysfs_register(struct request_queue *q) struct blk_mq_hw_ctx *hctx; int i, ret = 0;
- mutex_lock(&q->sysfs_lock); + mutex_lock(&q->sysfs_dir_lock); if (!q->mq_sysfs_init_done) goto unlock;
@@ -401,7 +401,7 @@ int blk_mq_sysfs_register(struct request_queue *q) }
unlock: - mutex_unlock(&q->sysfs_lock); + mutex_unlock(&q->sysfs_dir_lock);
return ret; } diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index ec20dc5e5e0f..ef727e0fc06e 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -893,6 +893,7 @@ int blk_register_queue(struct gendisk *disk) int ret; struct device *dev = disk_to_dev(disk); struct request_queue *q = disk->queue; + bool has_elevator = false;
if (WARN_ON(!q)) return -ENXIO; @@ -900,7 +901,6 @@ int blk_register_queue(struct gendisk *disk) WARN_ONCE(blk_queue_registered(q), "%s is registering an already registered queue\n", kobject_name(&dev->kobj)); - queue_flag_set_unlocked(QUEUE_FLAG_REGISTERED, q);
/* * SCSI probing may synchronously create and destroy a lot of @@ -921,8 +921,7 @@ int blk_register_queue(struct gendisk *disk) if (ret) return ret;
- /* Prevent changes through sysfs until registration is completed. */ - mutex_lock(&q->sysfs_lock); + mutex_lock(&q->sysfs_dir_lock);
ret = kobject_add(&q->kobj, kobject_get(&dev->kobj), "%s", "queue"); if (ret < 0) { @@ -935,26 +934,32 @@ int blk_register_queue(struct gendisk *disk) blk_mq_debugfs_register(q); }
- kobject_uevent(&q->kobj, KOBJ_ADD); - - wbt_enable_default(q); - - blk_throtl_register_queue(q); - if (q->request_fn || (q->mq_ops && q->elevator)) { - ret = elv_register_queue(q); + ret = elv_register_queue(q, false); if (ret) { - mutex_unlock(&q->sysfs_lock); - kobject_uevent(&q->kobj, KOBJ_REMOVE); + mutex_unlock(&q->sysfs_dir_lock); kobject_del(&q->kobj); blk_trace_remove_sysfs(dev); kobject_put(&dev->kobj); return ret; } + has_elevator = true; } + + mutex_lock(&q->sysfs_lock); + blk_queue_flag_set(QUEUE_FLAG_REGISTERED, q); + wbt_enable_default(q); + blk_throtl_register_queue(q); + + /* Now everything is ready and send out KOBJ_ADD uevent */ + kobject_uevent(&q->kobj, KOBJ_ADD); + if (has_elevator) + kobject_uevent(&q->elevator->kobj, KOBJ_ADD); + mutex_unlock(&q->sysfs_lock); + ret = 0; unlock: - mutex_unlock(&q->sysfs_lock); + mutex_unlock(&q->sysfs_dir_lock); return ret; } EXPORT_SYMBOL_GPL(blk_register_queue); @@ -985,23 +990,23 @@ void blk_unregister_queue(struct gendisk *disk) mutex_lock(&q->sysfs_lock);
blk_queue_flag_clear(QUEUE_FLAG_REGISTERED, q); + mutex_unlock(&q->sysfs_lock);
+ mutex_lock(&q->sysfs_dir_lock); /* * Remove the sysfs attributes before unregistering the queue data * structures that can be modified through sysfs. */ if (q->mq_ops) blk_mq_unregister_dev(disk_to_dev(disk), q); - mutex_unlock(&q->sysfs_lock);
kobject_uevent(&q->kobj, KOBJ_REMOVE); kobject_del(&q->kobj); blk_trace_remove_sysfs(disk_to_dev(disk));
- mutex_lock(&q->sysfs_lock); if (q->request_fn || (q->mq_ops && q->elevator)) elv_unregister_queue(q); - mutex_unlock(&q->sysfs_lock); + mutex_unlock(&q->sysfs_dir_lock);
kobject_put(&disk_to_dev(disk)->kobj); } diff --git a/block/blk.h b/block/blk.h index 1a5b67b57e6b..ae87e2a5f2bd 100644 --- a/block/blk.h +++ b/block/blk.h @@ -244,7 +244,7 @@ int elevator_init_mq(struct request_queue *q); int elevator_switch_mq(struct request_queue *q, struct elevator_type *new_e); void elevator_exit(struct request_queue *, struct elevator_queue *); -int elv_register_queue(struct request_queue *q); +int elv_register_queue(struct request_queue *q, bool uevent); void elv_unregister_queue(struct request_queue *q);
struct hd_struct *__disk_get_part(struct gendisk *disk, int partno); diff --git a/block/elevator.c b/block/elevator.c index aa878a90259e..cf10352e9154 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -833,13 +833,16 @@ static struct kobj_type elv_ktype = { .release = elevator_release, };
-int elv_register_queue(struct request_queue *q) +/* + * elv_register_queue is called from either blk_register_queue or + * elevator_switch, elevator switch is prevented from being happen + * in the two paths, so it is safe to not hold q->sysfs_lock. + */ +int elv_register_queue(struct request_queue *q, bool uevent) { struct elevator_queue *e = q->elevator; int error;
- lockdep_assert_held(&q->sysfs_lock); - error = kobject_add(&e->kobj, &q->kobj, "%s", "iosched"); if (!error) { struct elv_fs_entry *attr = e->type->elevator_attrs; @@ -850,10 +853,14 @@ int elv_register_queue(struct request_queue *q) attr++; } } - kobject_uevent(&e->kobj, KOBJ_ADD); + if (uevent) + kobject_uevent(&e->kobj, KOBJ_ADD); + + mutex_lock(&q->sysfs_lock); e->registered = 1; if (!e->uses_mq && e->type->ops.sq.elevator_registered_fn) e->type->ops.sq.elevator_registered_fn(q); + mutex_unlock(&q->sysfs_lock); } return error; } @@ -867,9 +874,12 @@ void elv_unregister_queue(struct request_queue *q)
kobject_uevent(&e->kobj, KOBJ_REMOVE); kobject_del(&e->kobj); + + mutex_lock(&q->sysfs_lock); e->registered = 0; /* Re-enable throttling in case elevator disabled it */ wbt_enable_default(q); + mutex_unlock(&q->sysfs_lock); } }
@@ -940,10 +950,32 @@ int elevator_switch_mq(struct request_queue *q, lockdep_assert_held(&q->sysfs_lock);
if (q->elevator) { - if (q->elevator->registered) + if (q->elevator->registered) { + mutex_unlock(&q->sysfs_lock); + + /* + * Concurrent elevator switch can't happen becasue + * sysfs write is always exclusively on same file. + * + * Also the elevator queue won't be freed after + * sysfs_lock is released becasue kobject_del() in + * blk_unregister_queue() waits for completion of + * .store & .show on its attributes. + */ elv_unregister_queue(q); + + mutex_lock(&q->sysfs_lock); + } ioc_clear_queue(q); elevator_exit(q, q->elevator); + + /* + * sysfs_lock may be dropped, so re-check if queue is + * unregistered. If yes, don't switch to new elevator + * any more + */ + if (!blk_queue_registered(q)) + return 0; }
ret = blk_mq_init_sched(q, new_e); @@ -951,7 +983,11 @@ int elevator_switch_mq(struct request_queue *q, goto out;
if (new_e) { - ret = elv_register_queue(q); + mutex_unlock(&q->sysfs_lock); + + ret = elv_register_queue(q, true); + + mutex_lock(&q->sysfs_lock); if (ret) { elevator_exit(q, q->elevator); goto out; @@ -1051,7 +1087,7 @@ static int elevator_switch(struct request_queue *q, struct elevator_type *new_e) if (err) goto fail_init;
- err = elv_register_queue(q); + err = elv_register_queue(q, true); if (err) goto fail_register;
@@ -1071,7 +1107,7 @@ static int elevator_switch(struct request_queue *q, struct elevator_type *new_e) /* switch failed, restore and re-register old elevator */ if (old) { q->elevator = old; - elv_register_queue(q); + elv_register_queue(q, true); blk_queue_bypass_end(q); }
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index ec86ef372e25..0754ad98db1f 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -650,6 +650,7 @@ struct request_queue { struct delayed_work requeue_work;
struct mutex sysfs_lock; + struct mutex sysfs_dir_lock;
/* * for reusing dead hctx instance in case of updating
From: Ming Lei ming.lei@redhat.com
mainline inclusion from mainline-5.4-rc1 commit b89f625e28d44552083f43752f62d8621ded0a04 category: bugfix bugzilla: 23392 CVE: NA
---------------------------
cecf5d87ff20 ("block: split .sysfs_lock into two locks") started releasing and re-acquiring sysfs_lock around registering/un-registering the elevator queue during an elevator switch, to avoid a potential deadlock between showing/storing the 'queue/iosched' attributes and removing the elevator's kobject.
It turns out there is no such deadlock, because 'q->sysfs_lock' isn't required in the .show & .store handlers of the queue/iosched attributes, and only the elevator's sysfs lock is acquired in elv_iosched_store() and elv_iosched_show(). So it is safe to hold the queue's sysfs lock while registering/un-registering the elevator queue.
The biggest issue is that commit cecf5d87ff20 assumes that concurrent writes to 'queue/scheduler' can't happen. However, this assumption isn't true, because kernfs_fop_write() only guarantees that concurrent writes aren't issued on the same open file; the writes could come from different opens of the file. So we can't release & re-acquire the queue's sysfs lock during an elevator switch, otherwise a use-after-free on the elevator could be triggered.
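A hypothetical interleaving consistent with the above (two different opens of 'queue/scheduler' writing concurrently; function names follow the hunks below, the exact timeline is illustrative only):

    writer A (open #1)                      writer B (open #2)
    ------------------                      ------------------
    elv_iosched_store()
      elevator_switch_mq()
        mutex_unlock(&q->sysfs_lock)
        elv_unregister_queue()
                                            elv_iosched_store()
                                              elevator_switch_mq()
                                                elevator_exit(q, q->elevator)
        mutex_lock(&q->sysfs_lock)
        ... continues with an elevator queue that writer B already freed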
Fix the issue by not releasing the queue's sysfs lock during the elevator switch.
Fixes: cecf5d87ff20 ("block: split .sysfs_lock into two locks") Cc: Christoph Hellwig hch@infradead.org Cc: Hannes Reinecke hare@suse.com Cc: Greg KH gregkh@linuxfoundation.org Cc: Mike Snitzer snitzer@redhat.com Reviewed-by: Bart Van Assche bvanassche@acm.org Signed-off-by: Ming Lei ming.lei@redhat.com Signed-off-by: Jens Axboe axboe@kernel.dk
Conflicts: block/blk-sysfs.c block/blk-elevator.c Signed-off-by: Yu Kuai yukuai3@huawei.com Reviewed-by: Yufen Yu yuyufen@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- block/blk-sysfs.c | 5 ++++- block/elevator.c | 31 +------------------------------ 2 files changed, 5 insertions(+), 31 deletions(-)
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index ef727e0fc06e..15ad67c3c56b 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -934,9 +934,11 @@ int blk_register_queue(struct gendisk *disk) blk_mq_debugfs_register(q); }
+ mutex_lock(&q->sysfs_lock); if (q->request_fn || (q->mq_ops && q->elevator)) { ret = elv_register_queue(q, false); if (ret) { + mutex_unlock(&q->sysfs_lock); mutex_unlock(&q->sysfs_dir_lock); kobject_del(&q->kobj); blk_trace_remove_sysfs(dev); @@ -946,7 +948,6 @@ int blk_register_queue(struct gendisk *disk) has_elevator = true; }
- mutex_lock(&q->sysfs_lock); blk_queue_flag_set(QUEUE_FLAG_REGISTERED, q); wbt_enable_default(q); blk_throtl_register_queue(q); @@ -1004,8 +1005,10 @@ void blk_unregister_queue(struct gendisk *disk) kobject_del(&q->kobj); blk_trace_remove_sysfs(disk_to_dev(disk));
+ mutex_lock(&q->sysfs_lock); if (q->request_fn || (q->mq_ops && q->elevator)) elv_unregister_queue(q); + mutex_unlock(&q->sysfs_lock); mutex_unlock(&q->sysfs_dir_lock);
kobject_put(&disk_to_dev(disk)->kobj); diff --git a/block/elevator.c b/block/elevator.c index cf10352e9154..f81592543dff 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -856,11 +856,9 @@ int elv_register_queue(struct request_queue *q, bool uevent) if (uevent) kobject_uevent(&e->kobj, KOBJ_ADD);
- mutex_lock(&q->sysfs_lock); e->registered = 1; if (!e->uses_mq && e->type->ops.sq.elevator_registered_fn) e->type->ops.sq.elevator_registered_fn(q); - mutex_unlock(&q->sysfs_lock); } return error; } @@ -875,11 +873,9 @@ void elv_unregister_queue(struct request_queue *q) kobject_uevent(&e->kobj, KOBJ_REMOVE); kobject_del(&e->kobj);
- mutex_lock(&q->sysfs_lock); e->registered = 0; /* Re-enable throttling in case elevator disabled it */ wbt_enable_default(q); - mutex_unlock(&q->sysfs_lock); } }
@@ -950,32 +946,11 @@ int elevator_switch_mq(struct request_queue *q, lockdep_assert_held(&q->sysfs_lock);
if (q->elevator) { - if (q->elevator->registered) { - mutex_unlock(&q->sysfs_lock); - - /* - * Concurrent elevator switch can't happen becasue - * sysfs write is always exclusively on same file. - * - * Also the elevator queue won't be freed after - * sysfs_lock is released becasue kobject_del() in - * blk_unregister_queue() waits for completion of - * .store & .show on its attributes. - */ + if (q->elevator->registered) elv_unregister_queue(q);
- mutex_lock(&q->sysfs_lock); - } ioc_clear_queue(q); elevator_exit(q, q->elevator); - - /* - * sysfs_lock may be dropped, so re-check if queue is - * unregistered. If yes, don't switch to new elevator - * any more - */ - if (!blk_queue_registered(q)) - return 0; }
ret = blk_mq_init_sched(q, new_e); @@ -983,11 +958,7 @@ int elevator_switch_mq(struct request_queue *q, goto out;
if (new_e) { - mutex_unlock(&q->sysfs_lock); - ret = elv_register_queue(q, true); - - mutex_lock(&q->sysfs_lock); if (ret) { elevator_exit(q, q->elevator); goto out;
From: Jan Kara jack@suse.cz
mainline inclusion from mainline-v5.9 commit 6dbf7bb555981fb5faf7b691e8f6169fc2b2e63b category: bugfix bugzilla: 41569 CVE: NA
-----------------------------------------------
If block_write_full_page() is called for a page that is beyond current inode size, it will truncate page buffers for the page and return 0. This logic has been added in 2.5.62 in commit 81eb69062588 ("fix ext3 BUG due to race with truncate") in history.git tree to fix a problem with ext3 in data=ordered mode. This particular problem doesn't exist anymore because ext3 is long gone and ext4 handles ordered data differently. Also normally buffers are invalidated by truncate code and there's no need to specially handle this in ->writepage() code.
This invalidation of page buffers in block_write_full_page() is causing issues for filesystems (e.g. ext4 or ocfs2) when the block device is shrunk under the filesystem's hands and metadata buffers get discarded while still being tracked by the journalling layer. Although it is obviously "not supported" it can cause kernel crashes like:
[ 7986.689400] BUG: unable to handle kernel NULL pointer dereference at +0000000000000008 [ 7986.697197] PGD 0 P4D 0 [ 7986.699724] Oops: 0002 [#1] SMP PTI [ 7986.703200] CPU: 4 PID: 203778 Comm: jbd2/dm-3-8 Kdump: loaded Tainted: G +O --------- - - 4.18.0-147.5.0.5.h126.eulerosv2r9.x86_64 #1 [ 7986.716438] Hardware name: Huawei RH2288H V3/BC11HGSA0, BIOS 1.57 08/11/2015 [ 7986.723462] RIP: 0010:jbd2_journal_grab_journal_head+0x1b/0x40 [jbd2] ... [ 7986.810150] Call Trace: [ 7986.812595] __jbd2_journal_insert_checkpoint+0x23/0x70 [jbd2] [ 7986.818408] jbd2_journal_commit_transaction+0x155f/0x1b60 [jbd2] [ 7986.836467] kjournald2+0xbd/0x270 [jbd2]
which is not great. The crash happens because bh->b_private is suddenly NULL although the BH_JBD flag is still set (this is because block_invalidatepage() cleared the BH_Mapped flag and a subsequent bh lookup found the buffer without BH_Mapped set and called init_page_buffers(), which rewrote bh->b_private). So just remove the invalidation in block_write_full_page().
Note that the buffer cache invalidation when a block device changes size is already careful to avoid similar problems by using invalidate_mapping_pages(), which skips busy buffers, so it was only this odd block_write_full_page() behavior that could tear down bdev buffers under a filesystem's hands.
Reported-by: Ye Bin yebin10@huawei.com Signed-off-by: Jan Kara jack@suse.cz Reviewed-by: Christoph Hellwig hch@lst.de CC: stable@vger.kernel.org Signed-off-by: Jens Axboe axboe@kernel.dk Signed-off-by: Ye Bin yebin10@huawei.com Reviewed-by: zhangyi (F) yi.zhang@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- fs/buffer.c | 16 ---------------- 1 file changed, 16 deletions(-)
diff --git a/fs/buffer.c b/fs/buffer.c index cd8371af236a..50cdeb1023a2 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -2742,16 +2742,6 @@ int nobh_writepage(struct page *page, get_block_t *get_block, /* Is the page fully outside i_size? (truncate in progress) */ offset = i_size & (PAGE_SIZE-1); if (page->index >= end_index+1 || !offset) { - /* - * The page may have dirty, unmapped buffers. For example, - * they may have been added in ext3_writepage(). Make them - * freeable here, so the page does not leak. - */ -#if 0 - /* Not really sure about this - do we need this ? */ - if (page->mapping->a_ops->invalidatepage) - page->mapping->a_ops->invalidatepage(page, offset); -#endif unlock_page(page); return 0; /* don't care */ } @@ -2946,12 +2936,6 @@ int block_write_full_page(struct page *page, get_block_t *get_block, /* Is the page fully outside i_size? (truncate in progress) */ offset = i_size & (PAGE_SIZE-1); if (page->index >= end_index+1 || !offset) { - /* - * The page may have dirty, unmapped buffers. For example, - * they may have been added in ext3_writepage(). Make them - * freeable here, so the page does not leak. - */ - do_invalidatepage(page, 0, PAGE_SIZE); unlock_page(page); return 0; /* don't care */ }
From: Christoph Hellwig hch@lst.de
mainline inclusion from mainline-v5.9-rc1 commit 58e46ed9cc634b3c64ea402789f8a92ce45cee48 category: bugfix bugzilla: 41569 CVE: NA
-----------------------------------------------
Move the locking and assignment of bd_claiming from bd_start_claiming to bd_prepare_to_claim.
Signed-off-by: Christoph Hellwig hch@lst.de Acked-by: Tejun Heo tj@kernel.org Signed-off-by: Jens Axboe axboe@kernel.dk
Conflicts: fs/block_dev.c
Signed-off-by: Ye Bin yebin10@huawei.com Reviewed-by: zhangyi (F) yi.zhang@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- fs/block_dev.c | 33 +++++++++++++-------------------- 1 file changed, 13 insertions(+), 20 deletions(-)
diff --git a/fs/block_dev.c b/fs/block_dev.c index 862a01a0a15d..dcc2a73168a6 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -1033,19 +1033,14 @@ static bool bd_may_claim(struct block_device *bdev, struct block_device *whole, }
/** - * bd_prepare_to_claim - prepare to claim a block device + * bd_prepare_to_claim - claim a block device * @bdev: block device of interest * @whole: the whole device containing @bdev, may equal @bdev * @holder: holder trying to claim @bdev * - * Prepare to claim @bdev. This function fails if @bdev is already - * claimed by another holder and waits if another claiming is in - * progress. This function doesn't actually claim. On successful - * return, the caller has ownership of bd_claiming and bd_holder[s]. - * - * CONTEXT: - * spin_lock(&bdev_lock). Might release bdev_lock, sleep and regrab - * it multiple times. + * Claim @bdev. This function fails if @bdev is already claimed by another + * holder and waits if another claiming is in progress. return, the caller + * has ownership of bd_claiming and bd_holder[s]. * * RETURNS: * 0 if @bdev can be claimed, -EBUSY otherwise. @@ -1054,9 +1049,12 @@ static int bd_prepare_to_claim(struct block_device *bdev, struct block_device *whole, void *holder) { retry: + spin_lock(&bdev_lock); /* if someone else claimed, fail */ - if (!bd_may_claim(bdev, whole, holder)) + if (!bd_may_claim(bdev, whole, holder)) { + spin_unlock(&bdev_lock); return -EBUSY; + }
/* if claiming is already in progress, wait for it to finish */ if (whole->bd_claiming) { @@ -1067,11 +1065,12 @@ static int bd_prepare_to_claim(struct block_device *bdev, spin_unlock(&bdev_lock); schedule(); finish_wait(wq, &wait); - spin_lock(&bdev_lock); goto retry; }
/* yay, all mine */ + whole->bd_claiming = holder; + spin_unlock(&bdev_lock); return 0; }
@@ -1153,19 +1152,13 @@ static struct block_device *bd_start_claiming(struct block_device *bdev, if (!whole) return ERR_PTR(-ENOMEM);
- /* prepare to claim, if successful, mark claiming in progress */ - spin_lock(&bdev_lock); - err = bd_prepare_to_claim(bdev, whole, holder); - if (err == 0) { - whole->bd_claiming = holder; - spin_unlock(&bdev_lock); - return whole; - } else { - spin_unlock(&bdev_lock); + if (err) { bdput(whole); return ERR_PTR(err); } + + return whole; }
#ifdef CONFIG_SYSFS
From: Jan Kara jack@suse.cz
mainline inclusion from mainline-v5.9 commit 384d87ef2c954fc58e6c5fd8253e4a1984f5fe02 category: bugfix bugzilla: 41569 CVE: NA
-----------------------------------------------
Discarding blocks and buffers under a mounted filesystem is hardly anything an admin wants to do. Usually it will confuse the filesystem and sometimes the loss of buffer_head state (including the b_private field) can even cause crashes like:
BUG: unable to handle kernel NULL pointer dereference at 0000000000000008 PGD 0 P4D 0 Oops: 0002 [#1] SMP PTI CPU: 4 PID: 203778 Comm: jbd2/dm-3-8 Kdump: loaded Tainted: G O --------- - - 4.18.0-147.5.0.5.h126.eulerosv2r9.x86_64 #1 Hardware name: Huawei RH2288H V3/BC11HGSA0, BIOS 1.57 08/11/2015 RIP: 0010:jbd2_journal_grab_journal_head+0x1b/0x40 [jbd2] ... Call Trace: __jbd2_journal_insert_checkpoint+0x23/0x70 [jbd2] jbd2_journal_commit_transaction+0x155f/0x1b60 [jbd2] kjournald2+0xbd/0x270 [jbd2]
So if we don't have the block device open with O_EXCL already, claim the block device while we truncate the buffer cache. This makes sure any exclusive block device user (such as a filesystem) cannot operate on the device while we are discarding the buffer cache.
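For illustration only (this program is not part of the patch), a minimal userspace sketch that exercises the changed path: with the patch applied, BLKDISCARD issued through a plain (non-exclusive) open of a device that is exclusively claimed, e.g. by a mounted filesystem, is expected to fail with EBUSY instead of tearing down the filesystem's buffers.

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>
    #include <errno.h>
    #include <fcntl.h>
    #include <unistd.h>
    #include <sys/ioctl.h>
    #include <linux/fs.h>

    int main(int argc, char **argv)
    {
            uint64_t range[2] = { 0, 1024 * 1024 }; /* discard the first 1 MiB */
            int fd;

            if (argc != 2)
                    return 1;
            fd = open(argv[1], O_WRONLY);           /* note: no O_EXCL held */
            if (fd < 0) {
                    perror("open");
                    return 1;
            }
            if (ioctl(fd, BLKDISCARD, range) < 0)
                    fprintf(stderr, "BLKDISCARD: %s\n", strerror(errno));
            close(fd);
            return 0;
    }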
Reported-by: Ye Bin yebin10@huawei.com Signed-off-by: Jan Kara jack@suse.cz Reviewed-by: Christoph Hellwig hch@lst.de [axboe: fix !CONFIG_BLOCK error in truncate_bdev_range()]
Conflicts: fs/block_dev.c include/linux/blkdev.h
Signed-off-by: Jens Axboe axboe@kernel.dk Signed-off-by: Ye Bin yebin10@huawei.com Reviewed-by: zhangyi (F) yi.zhang@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- block/ioctl.c | 16 +++++++----- fs/block_dev.c | 65 +++++++++++++++++++++++++++++++++++++++++++--- include/linux/fs.h | 7 +++++ 3 files changed, 78 insertions(+), 10 deletions(-)
diff --git a/block/ioctl.c b/block/ioctl.c index 3884d810efd2..5a6157b3735a 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -203,8 +203,7 @@ static int blk_ioctl_discard(struct block_device *bdev, fmode_t mode, uint64_t range[2]; uint64_t start, len; struct request_queue *q = bdev_get_queue(bdev); - struct address_space *mapping = bdev->bd_inode->i_mapping; - + int err;
if (!(mode & FMODE_WRITE)) return -EBADF; @@ -225,7 +224,11 @@ static int blk_ioctl_discard(struct block_device *bdev, fmode_t mode,
if (start + len > i_size_read(bdev->bd_inode)) return -EINVAL; - truncate_inode_pages_range(mapping, start, start + len - 1); + + err = truncate_bdev_range(bdev, mode, start, start + len - 1); + if (err) + return err; + return blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_KERNEL, flags); } @@ -234,8 +237,8 @@ static int blk_ioctl_zeroout(struct block_device *bdev, fmode_t mode, unsigned long arg) { uint64_t range[2]; - struct address_space *mapping; uint64_t start, end, len; + int err;
if (!(mode & FMODE_WRITE)) return -EBADF; @@ -257,8 +260,9 @@ static int blk_ioctl_zeroout(struct block_device *bdev, fmode_t mode, return -EINVAL;
/* Invalidate the page cache, including dirty pages */ - mapping = bdev->bd_inode->i_mapping; - truncate_inode_pages_range(mapping, start, end); + err = truncate_bdev_range(bdev, mode, start, end); + if (err) + return err;
return blkdev_issue_zeroout(bdev, start >> 9, len >> 9, GFP_KERNEL, BLKDEV_ZERO_NOUNMAP); diff --git a/fs/block_dev.c b/fs/block_dev.c index dcc2a73168a6..6168c61acb2c 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -1074,6 +1074,63 @@ static int bd_prepare_to_claim(struct block_device *bdev, return 0; }
+static void bd_clear_claiming(struct block_device *whole, void *holder) +{ + lockdep_assert_held(&bdev_lock); + /* tell others that we're done */ + BUG_ON(whole->bd_claiming != holder); + whole->bd_claiming = NULL; + wake_up_bit(&whole->bd_claiming, 0); +} + +/** + * bd_abort_claiming - abort claiming of a block device + * @bdev: block device of interest + * @whole: whole block device (returned from bd_start_claiming()) + * @holder: holder that has claimed @bdev + * + * Abort claiming of a block device when the exclusive open failed. This can be + * also used when exclusive open is not actually desired and we just needed + * to block other exclusive openers for a while. + */ +void bd_abort_claiming(struct block_device *bdev, struct block_device *whole, + void *holder) +{ + spin_lock(&bdev_lock); + bd_clear_claiming(whole, holder); + spin_unlock(&bdev_lock); +} +EXPORT_SYMBOL(bd_abort_claiming); + +/* + * Drop all buffers & page cache for given bdev range. This function bails + * with error if bdev has other exclusive owner (such as filesystem). + */ +int truncate_bdev_range(struct block_device *bdev, fmode_t mode, + loff_t lstart, loff_t lend) +{ + struct block_device *claimed_bdev = NULL; + int err; + + /* + * If we don't hold exclusive handle for the device, upgrade to it + * while we discard the buffer cache to avoid discarding buffers + * under live filesystem. + */ + if (!(mode & FMODE_EXCL)) { + claimed_bdev = bdev->bd_contains; + err = bd_prepare_to_claim(bdev, claimed_bdev, + truncate_bdev_range); + if (err) + return err; + } + truncate_inode_pages_range(bdev->bd_inode->i_mapping, lstart, lend); + if (claimed_bdev) + bd_abort_claiming(bdev, claimed_bdev, truncate_bdev_range); + return 0; +} +EXPORT_SYMBOL(truncate_bdev_range); + static struct gendisk *bdev_get_gendisk(struct block_device *bdev, int *partno) { struct gendisk *disk = get_gendisk(bdev->bd_dev, partno); @@ -2016,7 +2073,6 @@ static long blkdev_fallocate(struct file *file, int mode, loff_t start, loff_t len) { struct block_device *bdev = I_BDEV(bdev_file_inode(file)); - struct address_space *mapping; loff_t end = start + len - 1; loff_t isize; int error; @@ -2044,8 +2100,9 @@ static long blkdev_fallocate(struct file *file, int mode, loff_t start, return -EINVAL;
/* Invalidate the page cache, including dirty pages. */ - mapping = bdev->bd_inode->i_mapping; - truncate_inode_pages_range(mapping, start, end); + error = truncate_bdev_range(bdev, file->f_mode, start, end); + if (error) + return error;
switch (mode) { case FALLOC_FL_ZERO_RANGE: @@ -2072,7 +2129,7 @@ static long blkdev_fallocate(struct file *file, int mode, loff_t start, * the caller will be given -EBUSY. The third argument is * inclusive, so the rounding here is safe. */ - return invalidate_inode_pages2_range(mapping, + return invalidate_inode_pages2_range(bdev->bd_inode->i_mapping, start >> PAGE_SHIFT, end >> PAGE_SHIFT); } diff --git a/include/linux/fs.h b/include/linux/fs.h index 0a25421c1b02..18c55a0fc580 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2548,6 +2548,8 @@ extern void bd_forget(struct inode *inode); extern void bdput(struct block_device *); extern void invalidate_bdev(struct block_device *); extern void iterate_bdevs(void (*)(struct block_device *, void *), void *); +int truncate_bdev_range(struct block_device *bdev, fmode_t mode, loff_t lstart, + loff_t lend); extern int sync_blockdev(struct block_device *bdev); extern void kill_bdev(struct block_device *); extern struct super_block *freeze_bdev(struct block_device *); @@ -2564,6 +2566,11 @@ static inline bool sb_is_blkdev_sb(struct super_block *sb) } #else static inline void bd_forget(struct inode *inode) {} +static inline int truncate_bdev_range(struct block_device *bdev, fmode_t mode, + loff_t lstart, loff_t lend) +{ + return 0; +} static inline int sync_blockdev(struct block_device *bdev) { return 0; } static inline void kill_bdev(struct block_device *bdev) {} static inline void invalidate_bdev(struct block_device *bdev) {}
From: Mike Rapoport rppt@linux.ibm.com
mainline inclusion from mainline-v5.1-rc1 commit 5c01a25a210366362a40dc63f550e72688a60c48 category: bugfix bugzilla: 42401 CVE: NA
-------------------------------------------------
Marc Gonzalez reported the following kmemleak crash:
Unable to handle kernel paging request at virtual address ffffffc021e00000 Mem abort info: ESR = 0x96000006 Exception class = DABT (current EL), IL = 32 bits SET = 0, FnV = 0 EA = 0, S1PTW = 0 Data abort info: ISV = 0, ISS = 0x00000006 CM = 0, WnR = 0 swapper pgtable: 4k pages, 39-bit VAs, pgdp = (____ptrval____) [ffffffc021e00000] pgd=000000017e3ba803, pud=000000017e3ba803, pmd=0000000000000000 Internal error: Oops: 96000006 [#1] PREEMPT SMP Modules linked in: CPU: 6 PID: 523 Comm: kmemleak Tainted: G S W 5.0.0-rc1 #13 Hardware name: Qualcomm Technologies, Inc. MSM8998 v1 MTP (DT) pstate: 80000085 (Nzcv daIf -PAN -UAO) pc : scan_block+0x70/0x190 lr : scan_block+0x6c/0x190 Process kmemleak (pid: 523, stack limit = 0x(____ptrval____)) Call trace: scan_block+0x70/0x190 scan_gray_list+0x108/0x1c0 kmemleak_scan+0x33c/0x7c0 kmemleak_scan_thread+0x98/0xf0 kthread+0x11c/0x120 ret_from_fork+0x10/0x1c Code: f9000fb4 d503201f 97ffffd2 35000580 (f9400260)
The crash happens when a no-map area is allocated in early_init_dt_alloc_reserved_memory_arch(). The allocated region is registered with kmemleak, but it is then removed from memblock using memblock_remove(), which is not kmemleak-aware.
Replacing memblock_phys_alloc_range() with memblock_find_in_range() makes sure that the allocated memory is not added to kmemleak and then memblock_remove()'ing this memory is safe.
As a bonus, since memblock_find_in_range() ensures the allocation in the specified range, the bounds check can be removed.
[rppt@linux.ibm.com: of: fix parameters order for call to memblock_find_in_range()] Link: http://lkml.kernel.org/r/20190221112619.GC32004@rapoport-lnx Link: http://lkml.kernel.org/r/20190213181921.GB15270@rapoport-lnx Fixes: 3f0c820664483 ("drivers: of: add initialization code for dynamic reserved memory") Signed-off-by: Mike Rapoport rppt@linux.ibm.com Acked-by: Marek Szyprowski m.szyprowski@samsung.com Acked-by: Prateek Patel prpatel@nvidia.com Tested-by: Marc Gonzalez marc.w.gonzalez@free.fr Cc: Rob Herring robh+dt@kernel.org Cc: Frank Rowand frowand.list@gmail.com Cc: Catalin Marinas catalin.marinas@arm.com Signed-off-by: Andrew Morton akpm@linux-foundation.org Signed-off-by: Linus Torvalds torvalds@linux-foundation.org Signed-off-by: Liu Shixin liushixin2@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- drivers/of/of_reserved_mem.c | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-)
diff --git a/drivers/of/of_reserved_mem.c b/drivers/of/of_reserved_mem.c index 166570098cf8..a3a3180b1561 100644 --- a/drivers/of/of_reserved_mem.c +++ b/drivers/of/of_reserved_mem.c @@ -37,22 +37,15 @@ int __init __weak early_init_dt_alloc_reserved_memory_arch(phys_addr_t size, * panic()s on allocation failure. */ end = !end ? MEMBLOCK_ALLOC_ANYWHERE : end; - base = __memblock_alloc_base(size, align, end); + base = memblock_find_in_range(start, end, size, align); if (!base) return -ENOMEM;
- /* - * Check if the allocated region fits in to start..end window - */ - if (base < start) { - memblock_free(base, size); - return -ENOMEM; - } - *res_base = base; if (nomap) return memblock_remove(base, size); - return 0; + + return memblock_reserve(base, size); } #else int __init __weak early_init_dt_alloc_reserved_memory_arch(phys_addr_t size,
From: Heiner Kallweit hkallweit1@gmail.com
mainline inclusion from mainline-v4.20-rc1 commit e8cfd9d6c7727a067b38dbe7655ca02377fdb301 category: bugfix bugzilla: NA CVE: NA
----------------------------------------------------
phy_stop() may be called e.g. when suspending; therefore all needed actions should be performed synchronously. So add a synchronous call to the state machine.
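As a hedged illustration of why this matters (driver and helper names below are made up, not taken from the patch): a MAC driver's suspend path typically stops the PHY and then powers the hardware down, so the PHY_HALTED handling must not be left to the delayed state-machine work that might only run after the hardware is gone.

    static int foo_mac_suspend(struct device *dev)
    {
            struct foo_priv *priv = dev_get_drvdata(dev);   /* hypothetical driver data */

            /* With this patch phy_stop() runs the state machine synchronously,
             * so the PHY_HALTED transition is handled right here... */
            phy_stop(priv->phydev);

            /* ...before the MAC and its MDIO bus are powered off. */
            foo_mac_power_down(priv);                       /* hypothetical helper */

            return 0;
    }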
Signed-off-by: Heiner Kallweit hkallweit1@gmail.com Tested-by: Geert Uytterhoeven geert+renesas@glider.be Reviewed-by: Florian Fainelli f.fainelli@gmail.com Signed-off-by: David S. Miller davem@davemloft.net Reviewed-by: Weiwei Deng dengweiwei@huawei.com Reviewed-by: Zhaohui Zhong zhongzhaohui@huawei.com Reviewed-by: Yonglong Liu liuyonglong@huawei.com Signed-off-by: You Shengzui youshengzui@huawei.com Reviewed-by: YueHaibing yuehaibing@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- drivers/net/phy/phy.c | 2 ++ 1 file changed, 2 insertions(+)
diff --git a/drivers/net/phy/phy.c b/drivers/net/phy/phy.c index e561ef1e842a..6019e0aba234 100644 --- a/drivers/net/phy/phy.c +++ b/drivers/net/phy/phy.c @@ -861,6 +861,8 @@ void phy_stop(struct phy_device *phydev) out_unlock: mutex_unlock(&phydev->lock);
+ phy_state_machine(&phydev->state_queue.work); + /* Cannot call flush_scheduled_work() here as desired because * of rtnl_lock(), but PHY_HALTED shall guarantee phy_change() * will not reenable interrupts.
From: Heiner Kallweit hkallweit1@gmail.com
mainline inclusion from mainline-v5.1-rc1 commit 124eee3f6955f7aa19b9e6ff5c9b6d37cb3d1e2c category: bugfix bugzilla: NA CVE: NA
------------------------------------------------------
When bringing down the netdevice (including detaching it), calling netif_carrier_off directly or indirectly triggers an asynchronous linkwatch event. This linkwatch event may eventually fail to access chip registers in the ndo_get_stats/ndo_get_stats64 callback because the device isn't accessible any longer; see the call trace in [0].
To prevent this scenario, don't check for IFF_UP only, but also make sure that the netdevice is present.
[0] https://lists.openwall.net/netdev/2018/03/15/62
Signed-off-by: Heiner Kallweit hkallweit1@gmail.com Tested-by: Geert Uytterhoeven geert+renesas@glider.be Reviewed-by: Florian Fainelli f.fainelli@gmail.com Signed-off-by: David S. Miller davem@davemloft.net Reviewed-by: Weiwei Deng dengweiwei@huawei.com Reviewed-by: Zhaohui Zhong zhongzhaohui@huawei.com Reviewed-by: Yonglong Liu liuyonglong@huawei.com Signed-off-by: You Shengzui youshengzui@huawei.com Reviewed-by: YueHaibing yuehaibing@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- net/core/link_watch.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/net/core/link_watch.c b/net/core/link_watch.c index e38e641e98d5..7f51efb2b3ab 100644 --- a/net/core/link_watch.c +++ b/net/core/link_watch.c @@ -155,7 +155,7 @@ static void linkwatch_do_dev(struct net_device *dev) clear_bit(__LINK_STATE_LINKWATCH_PENDING, &dev->state);
rfc2863_policy(dev); - if (dev->flags & IFF_UP) { + if (dev->flags & IFF_UP && netif_device_present(dev)) { if (netif_carrier_ok(dev)) dev_activate(dev); else
From: Heiner Kallweit hkallweit1@gmail.com
mainline inclusion from mainline-v5.1-rc1 commit cbfd12b3e8c3542e8142aa041714ed614d3f67b0 category: bugfix bugzilla: NA CVE: NA
------------------------------------------------------
The call to the phylib state machine in phy_stop() just ensures that the state machine isn't re-triggered, but a state machine call may already be scheduled. So let's call phy_stop_machine(). This also allows us to get rid of the call to phy_stop_machine() in phy_disconnect().
Signed-off-by: Heiner Kallweit hkallweit1@gmail.com Reviewed-by: Andrew Lunn andrew@lunn.ch Signed-off-by: David S. Miller davem@davemloft.net Reviewed-by: Weiwei Deng dengweiwei@huawei.com Reviewed-by: Zhaohui Zhong zhongzhaohui@huawei.com Reviewed-by: Yonglong Liu liuyonglong@huawei.com Signed-off-by: You Shengzui youshengzui@huawei.com Reviewed-by: YueHaibing yuehaibing@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- drivers/net/phy/phy.c | 1 + drivers/net/phy/phy_device.c | 2 -- 2 files changed, 1 insertion(+), 2 deletions(-)
diff --git a/drivers/net/phy/phy.c b/drivers/net/phy/phy.c index 6019e0aba234..51e40a91db52 100644 --- a/drivers/net/phy/phy.c +++ b/drivers/net/phy/phy.c @@ -862,6 +862,7 @@ void phy_stop(struct phy_device *phydev) mutex_unlock(&phydev->lock);
phy_state_machine(&phydev->state_queue.work); + phy_stop_machine(phydev);
/* Cannot call flush_scheduled_work() here as desired because * of rtnl_lock(), but PHY_HALTED shall guarantee phy_change() diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c index 9ee52409b41d..fd88e5240877 100644 --- a/drivers/net/phy/phy_device.c +++ b/drivers/net/phy/phy_device.c @@ -832,8 +832,6 @@ void phy_disconnect(struct phy_device *phydev) if (phydev->irq > 0) phy_stop_interrupts(phydev);
- phy_stop_machine(phydev); - phydev->adjust_link = NULL;
phy_detach(phydev);