From: Guoqing Jiang <jgq516@gmail.com>

mainline inclusion
from mainline-v5.14-rc1
commit 10764815ff4728d2c57da677cd5d3dd6f446cf5f
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I587H6
CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=...
-------------------------------
We introduce a new bioset (io_acct_set) for raid0 and raid5, since they don't own clone infrastructure for accounting io. The bioset is added to mddev rather than to the raid0 and raid5 layers, because this way we can put the common functions in md.h and reuse them in raid0 and raid5.

Also add struct md_io_acct accordingly, which holds the io start_time plus the original bio and the cloned bio. Then we can call bio_{start,end}_io_acct to record the related io status.
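To make the flattened hunks below easier to follow, this is the intended call pattern for a personality (a minimal sketch mirroring the raid0 hunk in this patch): md_account_bio() may replace the caller's bio with a clone allocated from io_acct_set, and the clone's endio (md_end_io_acct) finishes the accounting on the original bio.

    /* In a personality's make_request path, after any bio split: */
    if (bio->bi_pool != &mddev->bio_set)   /* not a bio we split off ourselves */
        md_account_bio(mddev, &bio);       /* may replace bio with a clone */
    /* ... continue mapping bio, which may now be the accounting clone ... */

Bios split off by the personality come from mddev->bio_set and are chained to the original, which is what gets accounted, so the guard avoids double accounting.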
Signed-off-by: Guoqing Jiang <jiangguoqing@kylinos.cn>
Signed-off-by: Song Liu <song@kernel.org>
Conflict: drivers/md/md.c drivers/md/md.h
Signed-off-by: Zhang Wensheng <zhangwensheng5@huawei.com>
Reviewed-by: Jason Yan <yanaijie@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 drivers/md/md.c    | 52 ++++++++++++++++++++++++++++++++++++++++++----
 drivers/md/md.h    |  8 +++++++
 drivers/md/raid0.c |  3 +++
 drivers/md/raid5.c |  9 ++++++++
 4 files changed, 68 insertions(+), 4 deletions(-)
diff --git a/drivers/md/md.c b/drivers/md/md.c index a299fda5b0e9..10d9a8bed604 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -2361,7 +2361,8 @@ int md_integrity_register(struct mddev *mddev) bdev_get_integrity(reference->bdev));
pr_debug("md: data integrity enabled on %s\n", mdname(mddev)); - if (bioset_integrity_create(&mddev->bio_set, BIO_POOL_SIZE)) { + if (bioset_integrity_create(&mddev->bio_set, BIO_POOL_SIZE) || + bioset_integrity_create(&mddev->io_acct_set, BIO_POOL_SIZE)) { pr_err("md: failed to create integrity pool for %s\n", mdname(mddev)); return -EINVAL; @@ -5597,6 +5598,7 @@ static void md_free(struct kobject *ko)
bioset_exit(&mddev->bio_set); bioset_exit(&mddev->sync_set); + bioset_exit(&mddev->io_acct_set); kfree(mddev); }
@@ -5885,7 +5887,13 @@ int md_run(struct mddev *mddev) if (!bioset_initialized(&mddev->sync_set)) { err = bioset_init(&mddev->sync_set, BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS); if (err) - return err; + goto exit_bio_set; + } + if (!bioset_initialized(&mddev->io_acct_set)) { + err = bioset_init(&mddev->io_acct_set, BIO_POOL_SIZE, + offsetof(struct md_io_acct, bio_clone), 0); + if (err) + goto exit_sync_set; }
spin_lock(&pers_lock); @@ -6013,6 +6021,7 @@ int md_run(struct mddev *mddev) blk_queue_flag_set(QUEUE_FLAG_NONROT, mddev->queue); else blk_queue_flag_clear(QUEUE_FLAG_NONROT, mddev->queue); + blk_queue_flag_set(QUEUE_FLAG_IO_STAT, mddev->queue); } if (pers->sync_request) { if (mddev->kobj.sd && @@ -6062,8 +6071,11 @@ int md_run(struct mddev *mddev) module_put(pers->owner); md_bitmap_destroy(mddev); abort: - bioset_exit(&mddev->bio_set); + bioset_exit(&mddev->io_acct_set); +exit_sync_set: bioset_exit(&mddev->sync_set); +exit_bio_set: + bioset_exit(&mddev->bio_set); return err; } EXPORT_SYMBOL_GPL(md_run); @@ -6288,6 +6300,7 @@ void md_stop(struct mddev *mddev) __md_stop(mddev); bioset_exit(&mddev->bio_set); bioset_exit(&mddev->sync_set); + bioset_exit(&mddev->io_acct_set); }
EXPORT_SYMBOL_GPL(md_stop); @@ -8571,9 +8584,40 @@ void md_write_end(struct mddev *mddev) roundup(jiffies, mddev->safemode_delay) + mddev->safemode_delay); } - EXPORT_SYMBOL(md_write_end);
+static void md_end_io_acct(struct bio *bio) +{ + struct md_io_acct *md_io_acct = bio->bi_private; + struct bio *orig_bio = md_io_acct->orig_bio; + + orig_bio->bi_status = bio->bi_status; + + bio_end_io_acct(orig_bio, md_io_acct->start_time); + bio_put(bio); + bio_endio(orig_bio); +} + +/* used by personalities (raid0 and raid5) to account io stats */ +void md_account_bio(struct mddev *mddev, struct bio **bio) +{ + struct md_io_acct *md_io_acct; + struct bio *clone; + + if (!blk_queue_io_stat((*bio)->bi_disk->queue)) + return; + + clone = bio_clone_fast(*bio, GFP_NOIO, &mddev->io_acct_set); + md_io_acct = container_of(clone, struct md_io_acct, bio_clone); + md_io_acct->orig_bio = *bio; + md_io_acct->start_time = bio_start_io_acct(*bio); + + clone->bi_end_io = md_end_io_acct; + clone->bi_private = md_io_acct; + *bio = clone; +} +EXPORT_SYMBOL_GPL(md_account_bio); + /* md_allow_write(mddev) * Calling this ensures that the array is marked 'active' so that writes * may proceed without blocking. It is important to call this before diff --git a/drivers/md/md.h b/drivers/md/md.h index 85177d068322..20c16cfdd6bb 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -491,6 +491,7 @@ struct mddev { struct bio_set sync_set; /* for sync operations like * metadata and bitmap writes */ + struct bio_set io_acct_set; /* for raid0 and raid5 io accounting */
/* Generic flush handling. * The last to finish preflush schedules a worker to submit @@ -687,6 +688,12 @@ struct md_thread { void *private; };
+struct md_io_acct { + struct bio *orig_bio; + unsigned long start_time; + struct bio bio_clone; +}; + #define THREAD_WAKEUP 0
static inline void safe_put_page(struct page *p) @@ -716,6 +723,7 @@ extern void md_write_end(struct mddev *mddev); extern void md_done_sync(struct mddev *mddev, int blocks, int ok); extern void md_error(struct mddev *mddev, struct md_rdev *rdev); extern void md_finish_reshape(struct mddev *mddev); +void md_account_bio(struct mddev *mddev, struct bio **bio);
extern bool __must_check md_flush_request(struct mddev *mddev, struct bio *bio); extern void md_super_write(struct mddev *mddev, struct md_rdev *rdev, diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index 35843df15b5e..0312b192106a 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -556,6 +556,9 @@ static bool raid0_make_request(struct mddev *mddev, struct bio *bio) bio = split; }
+ if (bio->bi_pool != &mddev->bio_set) + md_account_bio(mddev, &bio); + orig_sector = sector; zone = find_zone(mddev->private, &sector); switch (conf->layout) { diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index c82953a3299e..35dacc6e754f 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -5487,6 +5487,7 @@ static struct bio *chunk_aligned_read(struct mddev *mddev, struct bio *raid_bio) sector_t sector = raid_bio->bi_iter.bi_sector; unsigned chunk_sects = mddev->chunk_sectors; unsigned sectors = chunk_sects - (sector & (chunk_sects-1)); + struct r5conf *conf = mddev->private;
if (sectors < bio_sectors(raid_bio)) { struct r5conf *conf = mddev->private; @@ -5496,6 +5497,9 @@ static struct bio *chunk_aligned_read(struct mddev *mddev, struct bio *raid_bio) raid_bio = split; }
+ if (raid_bio->bi_pool != &conf->bio_split) + md_account_bio(mddev, &raid_bio); + if (!raid5_read_one_chunk(mddev, raid_bio)) return raid_bio;
@@ -5775,6 +5779,7 @@ static bool raid5_make_request(struct mddev *mddev, struct bio * bi) DEFINE_WAIT(w); bool do_prepare; bool do_flush = false; + bool do_clone = false;
if (unlikely(bi->bi_opf & REQ_PREFLUSH)) { int ret = log_handle_flush_request(conf, bi); @@ -5803,6 +5808,7 @@ static bool raid5_make_request(struct mddev *mddev, struct bio * bi) if (rw == READ && mddev->degraded == 0 && mddev->reshape_position == MaxSector) { bi = chunk_aligned_read(mddev, bi); + do_clone = true; if (!bi) return true; } @@ -5817,6 +5823,9 @@ static bool raid5_make_request(struct mddev *mddev, struct bio * bi) last_sector = bio_end_sector(bi); bi->bi_next = NULL;
+ if (!do_clone) + md_account_bio(mddev, &bi); + prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE); for (; logical_sector < last_sector; logical_sector += RAID5_STRIPE_SECTORS(conf)) { int previous;
From: Guoqing Jiang <jgq516@gmail.com>

mainline inclusion
from mainline-v5.14-rc1
commit 1147f58e1010b8688bac1fd3bbab753b1379291d
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I587H6
CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=...
-------------------------------
After enabling io accounting, a chunk-aligned read bio could be cloned twice, which is not good. To avoid this inefficiency, clone align_bio from io_acct_set too; then we only need to call md_account_bio() unconditionally in make_request.
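Restated readably from the raid5_read_one_chunk() hunk below, the aligned-read clone now doubles as the accounting clone, so the bio is cloned exactly once:

    align_bi = bio_clone_fast(raid_bio, GFP_NOIO, &mddev->io_acct_set);
    if (!align_bi)
        return 0;
    md_io_acct = container_of(align_bi, struct md_io_acct, bio_clone);
    md_io_acct->orig_bio = raid_bio;
    if (blk_queue_io_stat(raid_bio->bi_disk->queue))
        md_io_acct->start_time = bio_start_io_acct(raid_bio);
    align_bi->bi_end_io = raid5_align_endio;   /* endio finishes the accounting */
    align_bi->bi_private = md_io_acct;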
Signed-off-by: Guoqing Jiang <jiangguoqing@kylinos.cn>
Signed-off-by: Song Liu <song@kernel.org>
Conflict: drivers/md/raid5.c
Signed-off-by: Zhang Wensheng <zhangwensheng5@huawei.com>
Reviewed-by: Jason Yan <yanaijie@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 drivers/md/raid5.c | 26 ++++++++++++++------------
 1 file changed, 14 insertions(+), 12 deletions(-)
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 35dacc6e754f..7103f3e330a7 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -5364,11 +5364,13 @@ static struct bio *remove_bio_from_retry(struct r5conf *conf, */ static void raid5_align_endio(struct bio *bi) { - struct bio* raid_bi = bi->bi_private; + struct md_io_acct *md_io_acct = bi->bi_private; + struct bio *raid_bi = md_io_acct->orig_bio; struct mddev *mddev; struct r5conf *conf; struct md_rdev *rdev; blk_status_t error = bi->bi_status; + unsigned long start_time = md_io_acct->start_time;
bio_put(bi);
@@ -5380,6 +5382,8 @@ static void raid5_align_endio(struct bio *bi) rdev_dec_pending(rdev, conf->mddev);
if (!error) { + if (blk_queue_io_stat(raid_bi->bi_disk->queue)) + bio_end_io_acct(raid_bi, start_time); bio_endio(raid_bi); if (atomic_dec_and_test(&conf->active_aligned_reads)) wake_up(&conf->wait_for_quiescent); @@ -5398,6 +5402,7 @@ static int raid5_read_one_chunk(struct mddev *mddev, struct bio *raid_bio) struct bio* align_bi; struct md_rdev *rdev; sector_t end_sector; + struct md_io_acct *md_io_acct;
if (!in_chunk_boundary(mddev, raid_bio)) { pr_debug("%s: non aligned\n", __func__); @@ -5406,15 +5411,20 @@ static int raid5_read_one_chunk(struct mddev *mddev, struct bio *raid_bio) /* * use bio_clone_fast to make a copy of the bio */ - align_bi = bio_clone_fast(raid_bio, GFP_NOIO, &mddev->bio_set); + align_bi = bio_clone_fast(raid_bio, GFP_NOIO, &mddev->io_acct_set); if (!align_bi) return 0; + md_io_acct = container_of(align_bi, struct md_io_acct, bio_clone); + raid_bio->bi_next = (void *)rdev; + if (blk_queue_io_stat(raid_bio->bi_disk->queue)) + md_io_acct->start_time = bio_start_io_acct(raid_bio); + md_io_acct->orig_bio = raid_bio; /* * set bi_end_io to a new function, and set bi_private to the * original bio. */ align_bi->bi_end_io = raid5_align_endio; - align_bi->bi_private = raid_bio; + align_bi->bi_private = md_io_acct; /* * compute position */ @@ -5487,7 +5497,6 @@ static struct bio *chunk_aligned_read(struct mddev *mddev, struct bio *raid_bio) sector_t sector = raid_bio->bi_iter.bi_sector; unsigned chunk_sects = mddev->chunk_sectors; unsigned sectors = chunk_sects - (sector & (chunk_sects-1)); - struct r5conf *conf = mddev->private;
if (sectors < bio_sectors(raid_bio)) { struct r5conf *conf = mddev->private; @@ -5497,9 +5506,6 @@ static struct bio *chunk_aligned_read(struct mddev *mddev, struct bio *raid_bio) raid_bio = split; }
- if (raid_bio->bi_pool != &conf->bio_split) - md_account_bio(mddev, &raid_bio); - if (!raid5_read_one_chunk(mddev, raid_bio)) return raid_bio;
@@ -5779,7 +5785,6 @@ static bool raid5_make_request(struct mddev *mddev, struct bio * bi) DEFINE_WAIT(w); bool do_prepare; bool do_flush = false; - bool do_clone = false;
if (unlikely(bi->bi_opf & REQ_PREFLUSH)) { int ret = log_handle_flush_request(conf, bi); @@ -5808,7 +5813,6 @@ static bool raid5_make_request(struct mddev *mddev, struct bio * bi) if (rw == READ && mddev->degraded == 0 && mddev->reshape_position == MaxSector) { bi = chunk_aligned_read(mddev, bi); - do_clone = true; if (!bi) return true; } @@ -5823,9 +5827,7 @@ static bool raid5_make_request(struct mddev *mddev, struct bio * bi) last_sector = bio_end_sector(bi); bi->bi_next = NULL;
- if (!do_clone) - md_account_bio(mddev, &bi); - + md_account_bio(mddev, &bi); prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE); for (; logical_sector < last_sector; logical_sector += RAID5_STRIPE_SECTORS(conf)) { int previous;
From: Guoqing Jiang <jgq516@gmail.com>

mainline inclusion
from mainline-v5.14-rc1
commit 9b8ae7b938235229ccb112c4e887ff1bcc232836
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I587H6
CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=...
-------------------------------
The caller of raid1_read_request() can pass either NULL or a valid pointer for "struct r1bio *r1_bio", so the flag really indicates whether r1_bio already exists. Rename print_msg to r1bio_existed to reflect that.
Signed-off-by: Guoqing Jiang <jiangguoqing@kylinos.cn>
Signed-off-by: Song Liu <song@kernel.org>
Signed-off-by: Zhang Wensheng <zhangwensheng5@huawei.com>
Reviewed-by: Jason Yan <yanaijie@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 drivers/md/raid1.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 9fccbf916015..9b1b38ad8d27 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -1210,7 +1210,7 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio, const unsigned long do_sync = (bio->bi_opf & REQ_SYNC); int max_sectors; int rdisk; - bool print_msg = !!r1_bio; + bool r1bio_existed = !!r1_bio; char b[BDEVNAME_SIZE];
/* @@ -1220,7 +1220,7 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio, */ gfp_t gfp = r1_bio ? (GFP_NOIO | __GFP_HIGH) : GFP_NOIO;
- if (print_msg) { + if (r1bio_existed) { /* Need to get the block device name carefully */ struct md_rdev *rdev; rcu_read_lock(); @@ -1252,7 +1252,7 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio,
if (rdisk < 0) { /* couldn't find anywhere to read from */ - if (print_msg) { + if (r1bio_existed) { pr_crit_ratelimited("md/raid1:%s: %s: unrecoverable I/O read error for block %llu\n", mdname(mddev), b, @@ -1263,7 +1263,7 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio, } mirror = conf->mirrors + rdisk;
- if (print_msg) + if (r1bio_existed) pr_info_ratelimited("md/raid1:%s: redirecting sector %llu to other mirror: %s\n", mdname(mddev), (unsigned long long)r1_bio->sector,
From: Guoqing Jiang <jgq516@gmail.com>

mainline inclusion
from mainline-v5.14-rc1
commit a0159832e51e3af03b89ecc5d6b9db451e529b5f
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I587H6
CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=...
-------------------------------
For raid1, record the io start time between splitting the bio and cloning it, and finish the accounting in the final endio.

Also introduce start_time in struct r1bio accordingly.
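Readably restated from the hunks below, the accounting brackets the io on the original bio whenever io stats are enabled on the queue:

    /* raid1_read_request(): start the clock only for a fresh r1_bio */
    if (!r1bio_existed && blk_queue_io_stat(bio->bi_disk->queue))
        r1_bio->start_time = bio_start_io_acct(bio);

    /* call_bio_endio(): close the same accounting window */
    if (blk_queue_io_stat(bio->bi_disk->queue))
        bio_end_io_acct(bio, r1_bio->start_time);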
Signed-off-by: Guoqing Jiang <jiangguoqing@kylinos.cn>
Signed-off-by: Song Liu <song@kernel.org>
Conflict: drivers/md/raid1.c
Signed-off-by: Zhang Wensheng <zhangwensheng5@huawei.com>
Reviewed-by: Jason Yan <yanaijie@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 drivers/md/raid1.c | 7 +++++++
 drivers/md/raid1.h | 1 +
 2 files changed, 8 insertions(+)
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 9b1b38ad8d27..7c01f8487427 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -300,6 +300,8 @@ static void call_bio_endio(struct r1bio *r1_bio) if (!test_bit(R1BIO_Uptodate, &r1_bio->state)) bio->bi_status = BLK_STS_IOERR;
+ if (blk_queue_io_stat(bio->bi_disk->queue)) + bio_end_io_acct(bio, r1_bio->start_time); bio_endio(bio); }
@@ -1292,6 +1294,9 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio,
r1_bio->read_disk = rdisk;
+ if (!r1bio_existed && blk_queue_io_stat(bio->bi_disk->queue)) + r1_bio->start_time = bio_start_io_acct(bio); + read_bio = bio_clone_fast(bio, gfp, &mddev->bio_set);
r1_bio->bios[rdisk] = read_bio; @@ -1481,6 +1486,8 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, r1_bio->sectors = max_sectors; }
+ if (blk_queue_io_stat(bio->bi_disk->queue)) + r1_bio->start_time = bio_start_io_acct(bio); atomic_set(&r1_bio->remaining, 1); atomic_set(&r1_bio->behind_remaining, 0);
diff --git a/drivers/md/raid1.h b/drivers/md/raid1.h index b7eb09e8c025..ccf10e59b116 100644 --- a/drivers/md/raid1.h +++ b/drivers/md/raid1.h @@ -158,6 +158,7 @@ struct r1bio { sector_t sector; int sectors; unsigned long state; + unsigned long start_time; struct mddev *mddev; /* * original bio going to /dev/mdx
From: Guoqing Jiang <jgq516@gmail.com>

mainline inclusion
from mainline-v5.14-rc1
commit 528bc2cf2fccef2c2c17263f9932094bf81fee5a
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I587H6
CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=...
-------------------------------
For raid10, record the io start time between splitting the bio and cloning it, and finish the accounting in the final endio, just as raid1 does.

Also introduce start_time in struct r10bio accordingly.
Signed-off-by: Guoqing Jiang <jiangguoqing@kylinos.cn>
Signed-off-by: Song Liu <song@kernel.org>
Reviewed-by: Jason Yan <yanaijie@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 drivers/md/raid10.c | 6 ++++++
 drivers/md/raid10.h | 1 +
 2 files changed, 7 insertions(+)
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 70dccc3c9631..1bc6cb48467f 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -297,6 +297,8 @@ static void raid_end_bio_io(struct r10bio *r10_bio) if (!test_bit(R10BIO_Uptodate, &r10_bio->state)) bio->bi_status = BLK_STS_IOERR;
+ if (blk_queue_io_stat(bio->bi_disk->queue)) + bio_end_io_acct(bio, r10_bio->start_time); bio_endio(bio); /* * Wake up any possible resync thread that waits for the device @@ -1185,6 +1187,8 @@ static void raid10_read_request(struct mddev *mddev, struct bio *bio, } slot = r10_bio->read_slot;
+ if (blk_queue_io_stat(bio->bi_disk->queue)) + r10_bio->start_time = bio_start_io_acct(bio); read_bio = bio_clone_fast(bio, gfp, &mddev->bio_set);
r10_bio->devs[slot].bio = read_bio; @@ -1468,6 +1472,8 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio, r10_bio->master_bio = bio; }
+ if (blk_queue_io_stat(bio->bi_disk->queue)) + r10_bio->start_time = bio_start_io_acct(bio); atomic_set(&r10_bio->remaining, 1); md_bitmap_startwrite(mddev->bitmap, r10_bio->sector, r10_bio->sectors, 0);
diff --git a/drivers/md/raid10.h b/drivers/md/raid10.h index 79cd2b7d3128..5420250d4bd6 100644 --- a/drivers/md/raid10.h +++ b/drivers/md/raid10.h @@ -124,6 +124,7 @@ struct r10bio { sector_t sector; /* virtual sector number */ int sectors; unsigned long state; + unsigned long start_time; struct mddev *mddev; /* * original bio going to /dev/mdx
From: Christoph Hellwig <hch@lst.de>

mainline inclusion
from mainline-v5.12-rc1
commit e82ed3a4fbb54b2d7dcb2a7733520f3e10b97abf
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I587H6
CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
-------------------------------
Refactor raid5_read_one_chunk so that all simple checks are done before allocating the bio.
Signed-off-by: Christoph Hellwig <hch@lst.de>
Acked-by: Song Liu <song@kernel.org>
Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Reviewed-by: Chaitanya Kulkarni <chaitanya.kulkarni@wdc.com>
Acked-by: Damien Le Moal <damien.lemoal@wdc.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
Conflict: drivers/md/raid5.c
Signed-off-by: Zhang Wensheng <zhangwensheng5@huawei.com>
Reviewed-by: Jason Yan <yanaijie@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 drivers/md/raid5.c | 122 +++++++++++++++++++--------------------------
 1 file changed, 52 insertions(+), 70 deletions(-)
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 7103f3e330a7..b4bd66d58ace 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -5398,97 +5398,79 @@ static void raid5_align_endio(struct bio *bi) static int raid5_read_one_chunk(struct mddev *mddev, struct bio *raid_bio) { struct r5conf *conf = mddev->private; - int dd_idx; - struct bio* align_bi; + struct bio *align_bio; struct md_rdev *rdev; - sector_t end_sector; + sector_t sector, end_sector, first_bad; + int bad_sectors, dd_idx; struct md_io_acct *md_io_acct;
if (!in_chunk_boundary(mddev, raid_bio)) { pr_debug("%s: non aligned\n", __func__); return 0; } - /* - * use bio_clone_fast to make a copy of the bio - */ - align_bi = bio_clone_fast(raid_bio, GFP_NOIO, &mddev->io_acct_set); - if (!align_bi) - return 0; - md_io_acct = container_of(align_bi, struct md_io_acct, bio_clone); - raid_bio->bi_next = (void *)rdev; - if (blk_queue_io_stat(raid_bio->bi_disk->queue)) - md_io_acct->start_time = bio_start_io_acct(raid_bio); - md_io_acct->orig_bio = raid_bio; - /* - * set bi_end_io to a new function, and set bi_private to the - * original bio. - */ - align_bi->bi_end_io = raid5_align_endio; - align_bi->bi_private = md_io_acct; - /* - * compute position - */ - align_bi->bi_iter.bi_sector = - raid5_compute_sector(conf, raid_bio->bi_iter.bi_sector, - 0, &dd_idx, NULL);
- end_sector = bio_end_sector(align_bi); + sector = raid5_compute_sector(conf, raid_bio->bi_iter.bi_sector, 0, + &dd_idx, NULL); + end_sector = bio_end_sector(raid_bio); + rcu_read_lock(); + if (r5c_big_stripe_cached(conf, sector)) + goto out_rcu_unlock; + rdev = rcu_dereference(conf->disks[dd_idx].replacement); if (!rdev || test_bit(Faulty, &rdev->flags) || rdev->recovery_offset < end_sector) { rdev = rcu_dereference(conf->disks[dd_idx].rdev); - if (rdev && - (test_bit(Faulty, &rdev->flags) || - !(test_bit(In_sync, &rdev->flags) || - rdev->recovery_offset >= end_sector))) - rdev = NULL; + if (!rdev) + goto out_rcu_unlock; + if (test_bit(Faulty, &rdev->flags) || + !(test_bit(In_sync, &rdev->flags) || + rdev->recovery_offset >= end_sector)) + goto out_rcu_unlock; }
- if (r5c_big_stripe_cached(conf, align_bi->bi_iter.bi_sector)) { - rcu_read_unlock(); - bio_put(align_bi); + atomic_inc(&rdev->nr_pending); + rcu_read_unlock(); + + align_bio = bio_clone_fast(raid_bio, GFP_NOIO, &mddev->io_acct_set); + md_io_acct = container_of(align_bio, struct md_io_acct, bio_clone); + raid_bio->bi_next = (void *)rdev; + if (blk_queue_io_stat(raid_bio->bi_disk->queue)) + md_io_acct->start_time = bio_start_io_acct(raid_bio); + md_io_acct->orig_bio = raid_bio; + + bio_set_dev(align_bio, rdev->bdev); + align_bio->bi_end_io = raid5_align_endio; + align_bio->bi_private = md_io_acct; + align_bio->bi_iter.bi_sector = sector; + + if (is_badblock(rdev, sector, bio_sectors(align_bio), &first_bad, + &bad_sectors)) { + bio_put(align_bio); + rdev_dec_pending(rdev, mddev); return 0; }
- if (rdev) { - sector_t first_bad; - int bad_sectors; + /* No reshape active, so we can trust rdev->data_offset */ + align_bio->bi_iter.bi_sector += rdev->data_offset;
- atomic_inc(&rdev->nr_pending); - rcu_read_unlock(); - raid_bio->bi_next = (void*)rdev; - bio_set_dev(align_bi, rdev->bdev); - - if (is_badblock(rdev, align_bi->bi_iter.bi_sector, - bio_sectors(align_bi), - &first_bad, &bad_sectors)) { - bio_put(align_bi); - rdev_dec_pending(rdev, mddev); - return 0; - } - - /* No reshape active, so we can trust rdev->data_offset */ - align_bi->bi_iter.bi_sector += rdev->data_offset; + spin_lock_irq(&conf->device_lock); + wait_event_lock_irq(conf->wait_for_quiescent, conf->quiesce == 0, + conf->device_lock); + atomic_inc(&conf->active_aligned_reads); + spin_unlock_irq(&conf->device_lock);
- spin_lock_irq(&conf->device_lock); - wait_event_lock_irq(conf->wait_for_quiescent, - conf->quiesce == 0, - conf->device_lock); - atomic_inc(&conf->active_aligned_reads); - spin_unlock_irq(&conf->device_lock); + if (mddev->gendisk) + trace_block_bio_remap(align_bio->bi_disk->queue, + align_bio, disk_devt(mddev->gendisk), + raid_bio->bi_iter.bi_sector); + submit_bio_noacct(align_bio); + return 1;
- if (mddev->gendisk) - trace_block_bio_remap(align_bi->bi_disk->queue, - align_bi, disk_devt(mddev->gendisk), - raid_bio->bi_iter.bi_sector); - submit_bio_noacct(align_bi); - return 1; - } else { - rcu_read_unlock(); - bio_put(align_bi); - return 0; - } +out_rcu_unlock: + rcu_read_unlock(); + return 0; }
static struct bio *chunk_aligned_read(struct mddev *mddev, struct bio *raid_bio)
From: Jan Kara <jack@suse.cz>

hulk inclusion
category: bugfix
bugzilla: 186975, https://gitee.com/openeuler/kernel/issues/I5HT6F
CVE: NA
Reference: https://patchwork.ozlabs.org/project/linux-ext4/list/?series=309169
--------------------------------
Do not let the shrinker reclaim entries that are currently in use. Firstly, such entries are likely still useful. Secondly, we will need to keep in-use entries to protect a pending increment of the xattr block refcount.
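For context (this reading is inferred from the thresholds used throughout the series, so treat the exact numbers as an assumption): an entry normally holds one reference for the hash list and one for the LRU list, so e_refcnt == 2 means "cached but unused", and anything above 2 means the entry has a real user. The new shrinker check encodes exactly that:

    /* Only cold, unused entries are worth reclaiming. */
    if (entry->e_referenced || atomic_read(&entry->e_refcnt) > 2) {
        entry->e_referenced = 0;
        list_move_tail(&entry->e_list, &cache->c_list);
        continue;
    }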
CC: stable@vger.kernel.org
Fixes: 82939d7999df ("ext4: convert to mbcache2")
Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Zhihao Cheng <chengzhihao1@huawei.com>
Reviewed-by: Zhang Yi <yi.zhang@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 fs/mbcache.c | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)
diff --git a/fs/mbcache.c b/fs/mbcache.c index 97c54d3a2227..cfc28129fb6f 100644 --- a/fs/mbcache.c +++ b/fs/mbcache.c @@ -288,7 +288,7 @@ static unsigned long mb_cache_shrink(struct mb_cache *cache, while (nr_to_scan-- && !list_empty(&cache->c_list)) { entry = list_first_entry(&cache->c_list, struct mb_cache_entry, e_list); - if (entry->e_referenced) { + if (entry->e_referenced || atomic_read(&entry->e_refcnt) > 2) { entry->e_referenced = 0; list_move_tail(&entry->e_list, &cache->c_list); continue; @@ -302,6 +302,14 @@ static unsigned long mb_cache_shrink(struct mb_cache *cache, spin_unlock(&cache->c_list_lock); head = mb_cache_entry_head(cache, entry->e_key); hlist_bl_lock(head); + /* Now a reliable check if the entry didn't get used... */ + if (atomic_read(&entry->e_refcnt) > 2) { + hlist_bl_unlock(head); + spin_lock(&cache->c_list_lock); + list_add_tail(&entry->e_list, &cache->c_list); + cache->c_entry_count++; + continue; + } if (!hlist_bl_unhashed(&entry->e_hash_list)) { hlist_bl_del_init(&entry->e_hash_list); atomic_dec(&entry->e_refcnt);
From: Jan Kara <jack@suse.cz>

hulk inclusion
category: bugfix
bugzilla: 186975, https://gitee.com/openeuler/kernel/issues/I5HT6F
CVE: NA
Reference: https://patchwork.ozlabs.org/project/linux-ext4/list/?series=309169
--------------------------------
Add a function, mb_cache_entry_delete_or_get(), to delete an mbcache entry only if it is unused, and a companion function, mb_cache_entry_wait_unused(), to wait for an entry to become unused. We do not share code between the two deletion functions because one of them will go away soon.
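The two functions are designed to be used together in a retry loop; this is the caller pattern the ext4 patches later in this series adopt:

    struct mb_cache_entry *oe;

    /* Wait for the entry to become unused so that we can remove it. */
    while ((oe = mb_cache_entry_delete_or_get(cache, key, value))) {
        mb_cache_entry_wait_unused(oe);
        /* Drop the reference delete_or_get() handed back, then retry. */
        mb_cache_entry_put(cache, oe);
    }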
CC: stable@vger.kernel.org
Fixes: 82939d7999df ("ext4: convert to mbcache2")
Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Zhihao Cheng <chengzhihao1@huawei.com>
Reviewed-by: Zhang Yi <yi.zhang@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 fs/mbcache.c            | 66 +++++++++++++++++++++++++++++++++++++++--
 include/linux/mbcache.h | 10 ++++++-
 2 files changed, 73 insertions(+), 3 deletions(-)
diff --git a/fs/mbcache.c b/fs/mbcache.c index cfc28129fb6f..2010bc80a3f2 100644 --- a/fs/mbcache.c +++ b/fs/mbcache.c @@ -11,7 +11,7 @@ /* * Mbcache is a simple key-value store. Keys need not be unique, however * key-value pairs are expected to be unique (we use this fact in - * mb_cache_entry_delete()). + * mb_cache_entry_delete_or_get()). * * Ext2 and ext4 use this cache for deduplication of extended attribute blocks. * Ext4 also uses it for deduplication of xattr values stored in inodes. @@ -125,6 +125,19 @@ void __mb_cache_entry_free(struct mb_cache_entry *entry) } EXPORT_SYMBOL(__mb_cache_entry_free);
+/* + * mb_cache_entry_wait_unused - wait to be the last user of the entry + * + * @entry - entry to work on + * + * Wait to be the last user of the entry. + */ +void mb_cache_entry_wait_unused(struct mb_cache_entry *entry) +{ + wait_var_event(&entry->e_refcnt, atomic_read(&entry->e_refcnt) <= 3); +} +EXPORT_SYMBOL(mb_cache_entry_wait_unused); + static struct mb_cache_entry *__entry_find(struct mb_cache *cache, struct mb_cache_entry *entry, u32 key) @@ -217,7 +230,7 @@ struct mb_cache_entry *mb_cache_entry_get(struct mb_cache *cache, u32 key, } EXPORT_SYMBOL(mb_cache_entry_get);
-/* mb_cache_entry_delete - remove a cache entry +/* mb_cache_entry_delete - try to remove a cache entry * @cache - cache we work with * @key - key * @value - value @@ -254,6 +267,55 @@ void mb_cache_entry_delete(struct mb_cache *cache, u32 key, u64 value) } EXPORT_SYMBOL(mb_cache_entry_delete);
+/* mb_cache_entry_delete_or_get - remove a cache entry if it has no users + * @cache - cache we work with + * @key - key + * @value - value + * + * Remove entry from cache @cache with key @key and value @value. The removal + * happens only if the entry is unused. The function returns NULL in case the + * entry was successfully removed or there's no entry in cache. Otherwise the + * function grabs reference of the entry that we failed to delete because it + * still has users and return it. + */ +struct mb_cache_entry *mb_cache_entry_delete_or_get(struct mb_cache *cache, + u32 key, u64 value) +{ + struct hlist_bl_node *node; + struct hlist_bl_head *head; + struct mb_cache_entry *entry; + + head = mb_cache_entry_head(cache, key); + hlist_bl_lock(head); + hlist_bl_for_each_entry(entry, node, head, e_hash_list) { + if (entry->e_key == key && entry->e_value == value) { + if (atomic_read(&entry->e_refcnt) > 2) { + atomic_inc(&entry->e_refcnt); + hlist_bl_unlock(head); + return entry; + } + /* We keep hash list reference to keep entry alive */ + hlist_bl_del_init(&entry->e_hash_list); + hlist_bl_unlock(head); + spin_lock(&cache->c_list_lock); + if (!list_empty(&entry->e_list)) { + list_del_init(&entry->e_list); + if (!WARN_ONCE(cache->c_entry_count == 0, + "mbcache: attempt to decrement c_entry_count past zero")) + cache->c_entry_count--; + atomic_dec(&entry->e_refcnt); + } + spin_unlock(&cache->c_list_lock); + mb_cache_entry_put(cache, entry); + return NULL; + } + } + hlist_bl_unlock(head); + + return NULL; +} +EXPORT_SYMBOL(mb_cache_entry_delete_or_get); + /* mb_cache_entry_touch - cache entry got used * @cache - cache the entry belongs to * @entry - entry that got used diff --git a/include/linux/mbcache.h b/include/linux/mbcache.h index 20f1e3ff6013..8eca7f25c432 100644 --- a/include/linux/mbcache.h +++ b/include/linux/mbcache.h @@ -30,15 +30,23 @@ void mb_cache_destroy(struct mb_cache *cache); int mb_cache_entry_create(struct mb_cache *cache, gfp_t mask, u32 key, u64 value, bool reusable); void __mb_cache_entry_free(struct mb_cache_entry *entry); +void mb_cache_entry_wait_unused(struct mb_cache_entry *entry); static inline int mb_cache_entry_put(struct mb_cache *cache, struct mb_cache_entry *entry) { - if (!atomic_dec_and_test(&entry->e_refcnt)) + unsigned int cnt = atomic_dec_return(&entry->e_refcnt); + + if (cnt > 0) { + if (cnt <= 3) + wake_up_var(&entry->e_refcnt); return 0; + } __mb_cache_entry_free(entry); return 1; }
+struct mb_cache_entry *mb_cache_entry_delete_or_get(struct mb_cache *cache, + u32 key, u64 value); void mb_cache_entry_delete(struct mb_cache *cache, u32 key, u64 value); struct mb_cache_entry *mb_cache_entry_get(struct mb_cache *cache, u32 key, u64 value);
From: Jan Kara <jack@suse.cz>

hulk inclusion
category: bugfix
bugzilla: 186975, https://gitee.com/openeuler/kernel/issues/I5HT6F
CVE: NA
Reference: https://patchwork.ozlabs.org/project/linux-ext4/list/?series=309169
--------------------------------
Currently we remove the EA inode from the mbcache as soon as its xattr refcount drops to zero. However, there can be pending attempts to reuse the inode, so the refcount handling code has to cope with the refcount increasing from zero anyway. So save some work and just keep the EA inode in the mbcache until it is evicted. At that moment we are sure a following iget() of the EA inode will fail anyway (or wait for the eviction to finish and load things from the disk again), so removing the mbcache entry at that moment is fine and simplifies the code a bit.
CC: stable@vger.kernel.org
Fixes: 82939d7999df ("ext4: convert to mbcache2")
Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Zhihao Cheng <chengzhihao1@huawei.com>
Reviewed-by: Zhang Yi <yi.zhang@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 fs/ext4/inode.c |  2 ++
 fs/ext4/xattr.c | 24 ++++++++----------------
 fs/ext4/xattr.h |  1 +
 3 files changed, 11 insertions(+), 16 deletions(-)
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index e2f1afcde5cf..e868b33ed8f5 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -179,6 +179,8 @@ void ext4_evict_inode(struct inode *inode)
trace_ext4_evict_inode(inode);
+ if (EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL) + ext4_evict_ea_inode(inode); if (inode->i_nlink) { /* * When journalling data dirty buffers are tracked only in the diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index b5016eb7b373..5f84c5dce0bb 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -436,6 +436,14 @@ static int ext4_xattr_inode_iget(struct inode *parent, unsigned long ea_ino, return err; }
+/* Remove entry from mbcache when EA inode is getting evicted */ +void ext4_evict_ea_inode(struct inode *inode) +{ + if (EA_INODE_CACHE(inode)) + mb_cache_entry_delete(EA_INODE_CACHE(inode), + ext4_xattr_inode_get_hash(inode), inode->i_ino); +} + static int ext4_xattr_inode_verify_hashes(struct inode *ea_inode, struct ext4_xattr_entry *entry, void *buffer, @@ -975,10 +983,8 @@ int __ext4_xattr_set_credits(struct super_block *sb, struct inode *inode, static int ext4_xattr_inode_update_ref(handle_t *handle, struct inode *ea_inode, int ref_change) { - struct mb_cache *ea_inode_cache = EA_INODE_CACHE(ea_inode); struct ext4_iloc iloc; s64 ref_count; - u32 hash; int ret;
inode_lock(ea_inode); @@ -1001,14 +1007,6 @@ static int ext4_xattr_inode_update_ref(handle_t *handle, struct inode *ea_inode,
set_nlink(ea_inode, 1); ext4_orphan_del(handle, ea_inode); - - if (ea_inode_cache) { - hash = ext4_xattr_inode_get_hash(ea_inode); - mb_cache_entry_create(ea_inode_cache, - GFP_NOFS, hash, - ea_inode->i_ino, - true /* reusable */); - } } } else { WARN_ONCE(ref_count < 0, "EA inode %lu ref_count=%lld", @@ -1021,12 +1019,6 @@ static int ext4_xattr_inode_update_ref(handle_t *handle, struct inode *ea_inode,
clear_nlink(ea_inode); ext4_orphan_add(handle, ea_inode); - - if (ea_inode_cache) { - hash = ext4_xattr_inode_get_hash(ea_inode); - mb_cache_entry_delete(ea_inode_cache, hash, - ea_inode->i_ino); - } } }
diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h index 87e5863bb493..b357872ab83b 100644 --- a/fs/ext4/xattr.h +++ b/fs/ext4/xattr.h @@ -191,6 +191,7 @@ extern void ext4_xattr_inode_array_free(struct ext4_xattr_inode_array *array);
extern int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize, struct ext4_inode *raw_inode, handle_t *handle); +extern void ext4_evict_ea_inode(struct inode *inode);
extern const struct xattr_handler *ext4_xattr_handlers[];
From: Jan Kara <jack@suse.cz>

hulk inclusion
category: bugfix
bugzilla: 186975, https://gitee.com/openeuler/kernel/issues/I5HT6F
CVE: NA
Reference: https://patchwork.ozlabs.org/project/linux-ext4/list/?series=309169
--------------------------------
Remove an unnecessary else branch (and thus one indentation level) from a code block in ext4_xattr_block_set(). It will also make the following code changes easier. No functional changes.
CC: stable@vger.kernel.org
Fixes: 82939d7999df ("ext4: convert to mbcache2")
Signed-off-by: Jan Kara <jack@suse.cz>
Conflicts:
	fs/ext4/xattr.c
[ 188c299e2a26 ("ext4: Support for checksumming from journal triggers") is
not applied. ]
Signed-off-by: Zhihao Cheng <chengzhihao1@huawei.com>
Reviewed-by: Zhang Yi <yi.zhang@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 fs/ext4/xattr.c | 79 ++++++++++++++++++++++++-------------------------
 1 file changed, 39 insertions(+), 40 deletions(-)
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 5f84c5dce0bb..2e83dcdc042f 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -1846,6 +1846,8 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode, #define header(x) ((struct ext4_xattr_header *)(x))
if (s->base) { + int offset = (char *)s->here - bs->bh->b_data; + BUFFER_TRACE(bs->bh, "get_write_access"); error = ext4_journal_get_write_access(handle, bs->bh); if (error) @@ -1877,50 +1879,47 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode, if (error) goto cleanup; goto inserted; - } else { - int offset = (char *)s->here - bs->bh->b_data; + } + unlock_buffer(bs->bh); + ea_bdebug(bs->bh, "cloning"); + s->base = kmalloc(bs->bh->b_size, GFP_NOFS); + error = -ENOMEM; + if (s->base == NULL) + goto cleanup; + memcpy(s->base, BHDR(bs->bh), bs->bh->b_size); + s->first = ENTRY(header(s->base)+1); + header(s->base)->h_refcount = cpu_to_le32(1); + s->here = ENTRY(s->base + offset); + s->end = s->base + bs->bh->b_size;
- unlock_buffer(bs->bh); - ea_bdebug(bs->bh, "cloning"); - s->base = kmalloc(bs->bh->b_size, GFP_NOFS); - error = -ENOMEM; - if (s->base == NULL) + /* + * If existing entry points to an xattr inode, we need + * to prevent ext4_xattr_set_entry() from decrementing + * ref count on it because the reference belongs to the + * original block. In this case, make the entry look + * like it has an empty value. + */ + if (!s->not_found && s->here->e_value_inum) { + ea_ino = le32_to_cpu(s->here->e_value_inum); + error = ext4_xattr_inode_iget(inode, ea_ino, + le32_to_cpu(s->here->e_hash), + &tmp_inode); + if (error) goto cleanup; - memcpy(s->base, BHDR(bs->bh), bs->bh->b_size); - s->first = ENTRY(header(s->base)+1); - header(s->base)->h_refcount = cpu_to_le32(1); - s->here = ENTRY(s->base + offset); - s->end = s->base + bs->bh->b_size;
- /* - * If existing entry points to an xattr inode, we need - * to prevent ext4_xattr_set_entry() from decrementing - * ref count on it because the reference belongs to the - * original block. In this case, make the entry look - * like it has an empty value. - */ - if (!s->not_found && s->here->e_value_inum) { - ea_ino = le32_to_cpu(s->here->e_value_inum); - error = ext4_xattr_inode_iget(inode, ea_ino, - le32_to_cpu(s->here->e_hash), - &tmp_inode); - if (error) - goto cleanup; - - if (!ext4_test_inode_state(tmp_inode, - EXT4_STATE_LUSTRE_EA_INODE)) { - /* - * Defer quota free call for previous - * inode until success is guaranteed. - */ - old_ea_inode_quota = le32_to_cpu( - s->here->e_value_size); - } - iput(tmp_inode); - - s->here->e_value_inum = 0; - s->here->e_value_size = 0; + if (!ext4_test_inode_state(tmp_inode, + EXT4_STATE_LUSTRE_EA_INODE)) { + /* + * Defer quota free call for previous + * inode until success is guaranteed. + */ + old_ea_inode_quota = le32_to_cpu( + s->here->e_value_size); } + iput(tmp_inode); + + s->here->e_value_inum = 0; + s->here->e_value_size = 0; } } else { /* Allocate a buffer where we construct the new block. */
From: Jan Kara <jack@suse.cz>

hulk inclusion
category: bugfix
bugzilla: 186975, https://gitee.com/openeuler/kernel/issues/I5HT6F
CVE: NA
Reference: https://patchwork.ozlabs.org/project/linux-ext4/list/?series=309169
--------------------------------
When ext4_xattr_block_set() decides to remove an xattr block, the following race can happen:
CPU1                                    CPU2
ext4_xattr_block_set()                  ext4_xattr_release_block()
  new_bh = ext4_xattr_block_cache_find()

                                          lock_buffer(bh);
                                          ref = le32_to_cpu(BHDR(bh)->h_refcount);
                                          if (ref == 1) {
                                                  ...
                                                  mb_cache_entry_delete();
                                                  unlock_buffer(bh);
                                                  ext4_free_blocks();
                                                    ...
                                                    ext4_forget(..., bh, ...);
                                                      jbd2_journal_revoke(..., bh);

  ext4_journal_get_write_access(..., new_bh, ...)
    do_get_write_access()
      jbd2_journal_cancel_revoke(..., new_bh);
Later the code in ext4_xattr_block_set() finds out the block got freed and cancels reuse of the block, but the revoke stays cancelled, so in case of block reuse and journal replay the filesystem can get corrupted. If the race works out slightly differently, we can also hit assertions in the jbd2 code.

Fix the problem by making sure that once a matching mbcache entry is found, the code dropping the last xattr block reference (or trying to modify the xattr block in place) waits until the mbcache entry reference is dropped. This way, code trying to reuse the xattr block is protected from someone dropping the last reference to the xattr block.
Reported-and-tested-by: Ritesh Harjani <ritesh.list@gmail.com>
CC: stable@vger.kernel.org
Fixes: 82939d7999df ("ext4: convert to mbcache2")
Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Zhihao Cheng <chengzhihao1@huawei.com>
Reviewed-by: Zhang Yi <yi.zhang@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 fs/ext4/xattr.c | 67 +++++++++++++++++++++++++++++++++----------------
 1 file changed, 45 insertions(+), 22 deletions(-)
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 2e83dcdc042f..da7610ef1fa9 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -439,9 +439,16 @@ static int ext4_xattr_inode_iget(struct inode *parent, unsigned long ea_ino, /* Remove entry from mbcache when EA inode is getting evicted */ void ext4_evict_ea_inode(struct inode *inode) { - if (EA_INODE_CACHE(inode)) - mb_cache_entry_delete(EA_INODE_CACHE(inode), - ext4_xattr_inode_get_hash(inode), inode->i_ino); + struct mb_cache_entry *oe; + + if (!EA_INODE_CACHE(inode)) + return; + /* Wait for entry to get unused so that we can remove it */ + while ((oe = mb_cache_entry_delete_or_get(EA_INODE_CACHE(inode), + ext4_xattr_inode_get_hash(inode), inode->i_ino))) { + mb_cache_entry_wait_unused(oe); + mb_cache_entry_put(EA_INODE_CACHE(inode), oe); + } }
static int @@ -1226,6 +1233,7 @@ ext4_xattr_release_block(handle_t *handle, struct inode *inode, if (error) goto out;
+retry_ref: lock_buffer(bh); hash = le32_to_cpu(BHDR(bh)->h_hash); ref = le32_to_cpu(BHDR(bh)->h_refcount); @@ -1235,9 +1243,18 @@ ext4_xattr_release_block(handle_t *handle, struct inode *inode, * This must happen under buffer lock for * ext4_xattr_block_set() to reliably detect freed block */ - if (ea_block_cache) - mb_cache_entry_delete(ea_block_cache, hash, - bh->b_blocknr); + if (ea_block_cache) { + struct mb_cache_entry *oe; + + oe = mb_cache_entry_delete_or_get(ea_block_cache, hash, + bh->b_blocknr); + if (oe) { + unlock_buffer(bh); + mb_cache_entry_wait_unused(oe); + mb_cache_entry_put(ea_block_cache, oe); + goto retry_ref; + } + } get_bh(bh); unlock_buffer(bh);
@@ -1862,9 +1879,20 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode, * ext4_xattr_block_set() to reliably detect modified * block */ - if (ea_block_cache) - mb_cache_entry_delete(ea_block_cache, hash, - bs->bh->b_blocknr); + if (ea_block_cache) { + struct mb_cache_entry *oe; + + oe = mb_cache_entry_delete_or_get(ea_block_cache, + hash, bs->bh->b_blocknr); + if (oe) { + /* + * Xattr block is getting reused. Leave + * it alone. + */ + mb_cache_entry_put(ea_block_cache, oe); + goto clone_block; + } + } ea_bdebug(bs->bh, "modifying in-place"); error = ext4_xattr_set_entry(i, s, handle, inode, true /* is_block */); @@ -1880,6 +1908,7 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode, goto cleanup; goto inserted; } +clone_block: unlock_buffer(bs->bh); ea_bdebug(bs->bh, "cloning"); s->base = kmalloc(bs->bh->b_size, GFP_NOFS); @@ -1986,18 +2015,13 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode, lock_buffer(new_bh); /* * We have to be careful about races with - * freeing, rehashing or adding references to - * xattr block. Once we hold buffer lock xattr - * block's state is stable so we can check - * whether the block got freed / rehashed or - * not. Since we unhash mbcache entry under - * buffer lock when freeing / rehashing xattr - * block, checking whether entry is still - * hashed is reliable. Same rules hold for - * e_reusable handling. + * adding references to xattr block. Once we + * hold buffer lock xattr block's state is + * stable so we can check the additional + * reference fits. */ - if (hlist_bl_unhashed(&ce->e_hash_list) || - !ce->e_reusable) { + ref = le32_to_cpu(BHDR(new_bh)->h_refcount) + 1; + if (ref > EXT4_XATTR_REFCOUNT_MAX) { /* * Undo everything and check mbcache * again. @@ -2012,9 +2036,8 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode, new_bh = NULL; goto inserted; } - ref = le32_to_cpu(BHDR(new_bh)->h_refcount) + 1; BHDR(new_bh)->h_refcount = cpu_to_le32(ref); - if (ref >= EXT4_XATTR_REFCOUNT_MAX) + if (ref == EXT4_XATTR_REFCOUNT_MAX) ce->e_reusable = 0; ea_bdebug(new_bh, "reusing; refcount now=%d", ref);
From: Haoyue Xu <xuhaoyue1@hisilicon.com>

mainline inclusion
from mainline-for-next
commit f5c25465b4f7d3badcaa5bf4a6f82f5763865b19
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I5IZO5
CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma.git/commit/?id=f5c...
----------------------------------------------------------------------
The HNS NIC driver receives and handles the RAS-type abnormal interrupt generated by ROCEE, so the HNS RDMA driver does not need to handle this type of interrupt. Therefore, delete the unused code from the HNS RDMA driver.
Link: https://lore.kernel.org/r/20220714134353.16700-2-liangwenpeng@huawei.com
Signed-off-by: Haoyue Xu <xuhaoyue1@hisilicon.com>
Signed-off-by: Wenpeng Liang <liangwenpeng@huawei.com>
Signed-off-by: Leon Romanovsky <leon@kernel.org>
Signed-off-by: Zhengfeng Luo <luozhengfeng@h-partners.com>
Reviewed-by: Yangyang Li <liyangyang20@huawei.com>
Reviewed-by: Yue Haibing <yuehaibing@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 10 ----------
 drivers/infiniband/hw/hns/hns_roce_hw_v2.h |  1 -
 2 files changed, 11 deletions(-)
diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c index b5ed2aee578b..06b9f2ed73ce 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c @@ -5659,16 +5659,6 @@ static irqreturn_t hns_roce_v2_msix_interrupt_abn(int irq, void *dev_id) int_en |= 1 << HNS_ROCE_V2_VF_ABN_INT_EN_S; roce_write(hr_dev, ROCEE_VF_ABN_INT_EN_REG, int_en);
- int_work = 1; - } else if (int_st & BIT(HNS_ROCE_V2_VF_INT_ST_RAS_INT_S)) { - dev_err(dev, "RAS interrupt!\n"); - - int_st |= 1 << HNS_ROCE_V2_VF_INT_ST_RAS_INT_S; - roce_write(hr_dev, ROCEE_VF_ABN_INT_ST_REG, int_st); - - int_en |= 1 << HNS_ROCE_V2_VF_ABN_INT_EN_S; - roce_write(hr_dev, ROCEE_VF_ABN_INT_EN_REG, int_en); - int_work = 1; } else { dev_err(dev, "There is no abnormal irq found!\n"); diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.h b/drivers/infiniband/hw/hns/hns_roce_hw_v2.h index a3a2524a5e25..b30d2ebdbe59 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.h +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.h @@ -1354,7 +1354,6 @@ struct hns_roce_dip { #define HNS_ROCE_V2_ASYNC_EQE_NUM 0x1000
#define HNS_ROCE_V2_VF_INT_ST_AEQ_OVERFLOW_S 0 -#define HNS_ROCE_V2_VF_INT_ST_RAS_INT_S 1
#define HNS_ROCE_EQ_DB_CMD_AEQ 0x0 #define HNS_ROCE_EQ_DB_CMD_AEQ_ARMED 0x1
From: Haoyue Xu <xuhaoyue1@hisilicon.com>

mainline inclusion
from mainline-for-next
commit d95e0a0c6c9602ff6bb90c1c20987b204493d8e1
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I5IZO5
CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma.git/commit/?id=d95...
----------------------------------------------------------------------
The return type of an interrupt handler should be irqreturn_t, so convert the EQ interrupt routines to use it.
Link: https://lore.kernel.org/r/20220714134353.16700-3-liangwenpeng@huawei.com
Signed-off-by: Haoyue Xu <xuhaoyue1@hisilicon.com>
Signed-off-by: Wenpeng Liang <liangwenpeng@huawei.com>
Signed-off-by: Leon Romanovsky <leon@kernel.org>
Signed-off-by: Zhengfeng Luo <luozhengfeng@h-partners.com>
Reviewed-by: Yangyang Li <liyangyang20@huawei.com>
Reviewed-by: Yue Haibing <yuehaibing@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 27 +++++++++++-----------
 1 file changed, 14 insertions(+), 13 deletions(-)
diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c index 06b9f2ed73ce..bb9de18725bc 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c @@ -5501,12 +5501,12 @@ static struct hns_roce_aeqe *next_aeqe_sw_v2(struct hns_roce_eq *eq) !!(eq->cons_index & eq->entries)) ? aeqe : NULL; }
-static int hns_roce_v2_aeq_int(struct hns_roce_dev *hr_dev, - struct hns_roce_eq *eq) +static irqreturn_t hns_roce_v2_aeq_int(struct hns_roce_dev *hr_dev, + struct hns_roce_eq *eq) { struct device *dev = hr_dev->dev; struct hns_roce_aeqe *aeqe = next_aeqe_sw_v2(eq); - int aeqe_found = 0; + irqreturn_t aeqe_found = IRQ_NONE; int event_type; u32 queue_num; int sub_type; @@ -5560,7 +5560,7 @@ static int hns_roce_v2_aeq_int(struct hns_roce_dev *hr_dev, eq->event_type = event_type; eq->sub_type = sub_type; ++eq->cons_index; - aeqe_found = 1; + aeqe_found = IRQ_HANDLED;
hns_roce_v2_init_irq_work(hr_dev, eq, queue_num);
@@ -5568,7 +5568,8 @@ static int hns_roce_v2_aeq_int(struct hns_roce_dev *hr_dev, }
update_eq_db(eq); - return aeqe_found; + + return IRQ_RETVAL(aeqe_found); }
static struct hns_roce_ceqe *next_ceqe_sw_v2(struct hns_roce_eq *eq) @@ -5583,11 +5584,11 @@ static struct hns_roce_ceqe *next_ceqe_sw_v2(struct hns_roce_eq *eq) !!(eq->cons_index & eq->entries)) ? ceqe : NULL; }
-static int hns_roce_v2_ceq_int(struct hns_roce_dev *hr_dev, - struct hns_roce_eq *eq) +static irqreturn_t hns_roce_v2_ceq_int(struct hns_roce_dev *hr_dev, + struct hns_roce_eq *eq) { struct hns_roce_ceqe *ceqe = next_ceqe_sw_v2(eq); - int ceqe_found = 0; + irqreturn_t ceqe_found = IRQ_NONE; u32 cqn;
while (ceqe) { @@ -5601,21 +5602,21 @@ static int hns_roce_v2_ceq_int(struct hns_roce_dev *hr_dev, hns_roce_cq_completion(hr_dev, cqn);
++eq->cons_index; - ceqe_found = 1; + ceqe_found = IRQ_HANDLED;
ceqe = next_ceqe_sw_v2(eq); }
update_eq_db(eq);
- return ceqe_found; + return IRQ_RETVAL(ceqe_found); }
static irqreturn_t hns_roce_v2_msix_interrupt_eq(int irq, void *eq_ptr) { struct hns_roce_eq *eq = eq_ptr; struct hns_roce_dev *hr_dev = eq->hr_dev; - int int_work; + irqreturn_t int_work;
if (eq->type_flag == HNS_ROCE_CEQ) /* Completion event interrupt */ @@ -5631,7 +5632,7 @@ static irqreturn_t hns_roce_v2_msix_interrupt_abn(int irq, void *dev_id) { struct hns_roce_dev *hr_dev = dev_id; struct device *dev = hr_dev->dev; - int int_work = 0; + irqreturn_t int_work = IRQ_NONE; u32 int_st; u32 int_en;
@@ -5659,7 +5660,7 @@ static irqreturn_t hns_roce_v2_msix_interrupt_abn(int irq, void *dev_id) int_en |= 1 << HNS_ROCE_V2_VF_ABN_INT_EN_S; roce_write(hr_dev, ROCEE_VF_ABN_INT_EN_REG, int_en);
- int_work = 1; + int_work = IRQ_HANDLED; } else { dev_err(dev, "There is no abnormal irq found!\n"); }
From: Haoyue Xu <xuhaoyue1@hisilicon.com>

mainline inclusion
from mainline-for-next
commit ecb4db5c3590aa956b4b2c89a560e4ae505b6fb52
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5IZO5
CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma.git/commit/?id=ecb...
----------------------------------------------------------------------
When handling an AEQ overflow interrupt, the driver clears all the interrupts in the same status register instead of only the one it handled. It should set only the interrupt status bit of the AEQ overflow type.
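The status register is apparently write-1-to-clear (an assumption consistent with this fix): writing back the whole value that was read acknowledges every pending interrupt, not only the one being handled. The fix writes just the serviced bit:

    /* Before: int_st |= BIT(...); roce_write(..., int_st); acked all pending bits. */
    /* After: acknowledge only the AEQ overflow interrupt. */
    roce_write(hr_dev, ROCEE_VF_ABN_INT_ST_REG,
               1 << HNS_ROCE_V2_VF_INT_ST_AEQ_OVERFLOW_S);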
Fixes: a5073d6054f7 ("RDMA/hns: Add eq support of hip08")
Link: https://lore.kernel.org/r/20220714134353.16700-4-liangwenpeng@huawei.com
Signed-off-by: Haoyue Xu <xuhaoyue1@hisilicon.com>
Signed-off-by: Wenpeng Liang <liangwenpeng@huawei.com>
Signed-off-by: Leon Romanovsky <leon@kernel.org>
Signed-off-by: Zhengfeng Luo <luozhengfeng@h-partners.com>
Reviewed-by: Yangyang Li <liyangyang20@huawei.com>
Reviewed-by: Yue Haibing <yuehaibing@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c index bb9de18725bc..27d226ed9396 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c @@ -5647,8 +5647,8 @@ static irqreturn_t hns_roce_v2_msix_interrupt_abn(int irq, void *dev_id)
dev_err(dev, "AEQ overflow!\n");
- int_st |= 1 << HNS_ROCE_V2_VF_INT_ST_AEQ_OVERFLOW_S; - roce_write(hr_dev, ROCEE_VF_ABN_INT_ST_REG, int_st); + roce_write(hr_dev, ROCEE_VF_ABN_INT_ST_REG, + 1 << HNS_ROCE_V2_VF_INT_ST_AEQ_OVERFLOW_S);
/* Set reset level for reset_event() */ if (ops->set_default_reset_request)
From: Haoyue Xu <xuhaoyue1@hisilicon.com>

mainline inclusion
from mainline-for-next
commit 75e4e716f7089558fda4ddc660fa8dbdec4eb1d3
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I5IZO5
CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma.git/commit/?id=75e...
----------------------------------------------------------------------
Use a single function to handle the same kind of abnormal interrupts.
Link: https://lore.kernel.org/r/20220714134353.16700-5-liangwenpeng@huawei.com
Signed-off-by: Haoyue Xu <xuhaoyue1@hisilicon.com>
Signed-off-by: Wenpeng Liang <liangwenpeng@huawei.com>
Signed-off-by: Leon Romanovsky <leon@kernel.org>
Signed-off-by: Zhengfeng Luo <luozhengfeng@h-partners.com>
Reviewed-by: Yangyang Li <liyangyang20@huawei.com>
Reviewed-by: Yue Haibing <yuehaibing@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 35 ++++++++++++++--------
 1 file changed, 23 insertions(+), 12 deletions(-)
diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c index 27d226ed9396..8e8a5f69ea09 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c @@ -5628,24 +5628,19 @@ static irqreturn_t hns_roce_v2_msix_interrupt_eq(int irq, void *eq_ptr) return IRQ_RETVAL(int_work); }
-static irqreturn_t hns_roce_v2_msix_interrupt_abn(int irq, void *dev_id) +static irqreturn_t abnormal_interrupt_basic(struct hns_roce_dev *hr_dev, + u32 int_st) { - struct hns_roce_dev *hr_dev = dev_id; - struct device *dev = hr_dev->dev; + struct pci_dev *pdev = hr_dev->pci_dev; + struct hnae3_ae_dev *ae_dev = pci_get_drvdata(pdev); + const struct hnae3_ae_ops *ops = ae_dev->ops; irqreturn_t int_work = IRQ_NONE; - u32 int_st; u32 int_en;
- /* Abnormal interrupt */ - int_st = roce_read(hr_dev, ROCEE_VF_ABN_INT_ST_REG); int_en = roce_read(hr_dev, ROCEE_VF_ABN_INT_EN_REG);
if (int_st & BIT(HNS_ROCE_V2_VF_INT_ST_AEQ_OVERFLOW_S)) { - struct pci_dev *pdev = hr_dev->pci_dev; - struct hnae3_ae_dev *ae_dev = pci_get_drvdata(pdev); - const struct hnae3_ae_ops *ops = ae_dev->ops; - - dev_err(dev, "AEQ overflow!\n"); + dev_err(hr_dev->dev, "AEQ overflow!\n");
roce_write(hr_dev, ROCEE_VF_ABN_INT_ST_REG, 1 << HNS_ROCE_V2_VF_INT_ST_AEQ_OVERFLOW_S); @@ -5662,12 +5657,28 @@ static irqreturn_t hns_roce_v2_msix_interrupt_abn(int irq, void *dev_id)
int_work = IRQ_HANDLED; } else { - dev_err(dev, "There is no abnormal irq found!\n"); + dev_err(hr_dev->dev, "there is no basic abn irq found.\n"); }
return IRQ_RETVAL(int_work); }
+static irqreturn_t hns_roce_v2_msix_interrupt_abn(int irq, void *dev_id) +{ + struct hns_roce_dev *hr_dev = dev_id; + irqreturn_t int_work = IRQ_NONE; + u32 int_st; + + int_st = roce_read(hr_dev, ROCEE_VF_ABN_INT_ST_REG); + + if (int_st) + int_work = abnormal_interrupt_basic(hr_dev, int_st); + else + dev_err(hr_dev->dev, "there is no abnormal irq found.\n"); + + return IRQ_RETVAL(int_work); +} + static void hns_roce_v2_int_mask_enable(struct hns_roce_dev *hr_dev, int eq_num, u32 enable_flag) {
From: Haoyue Xu <xuhaoyue1@hisilicon.com>

mainline inclusion
from mainline-for-next
commit 2de949abd6a539fac4b2c89a560e4ae505b6fb52
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I5IZO5
CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma.git/commit/?id=2de...
----------------------------------------------------------------------
Since ECC memory keeps the memory system immune to single-bit errors, add support for correcting 1bit-ECC errors, which prevents a 1bit-ECC error from becoming an uncorrectable error. When a 1bit-ECC error happens in the internal RAM of the ROCE engine, for example in the QPC table, the error is detected on read, and the ROCE engine corrects it by writing the data back.
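In outline, the recovery flow added below: a work item queries the ECC information over the command queue, then rewrites the affected base-address-table (BT0) entry so the corrected data is written back to RAM; the GMV table is special-cased because it is configured through the command queue rather than a mailbox (condensed from the hunk below):

    static void fmea_ram_ecc_work(struct work_struct *ecc_work)
    {
        struct hns_roce_dev *hr_dev =
            container_of(ecc_work, struct hns_roce_dev, ecc_work);
        struct fmea_ram_ecc ecc_info = {};

        if (fmea_ram_ecc_query(hr_dev, &ecc_info) || !ecc_info.is_ecc_err)
            return;    /* query failed or no 1bit-ECC error pending */

        /* GMV goes via cmdq; everything else reads BT0 and writes it back. */
        fmea_ram_ecc_recover(hr_dev, &ecc_info);
    }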
Link: https://lore.kernel.org/r/20220714134353.16700-6-liangwenpeng@huawei.com
Signed-off-by: Haoyue Xu <xuhaoyue1@hisilicon.com>
Signed-off-by: Wenpeng Liang <liangwenpeng@huawei.com>
Signed-off-by: Leon Romanovsky <leon@kernel.org>
Signed-off-by: Zhengfeng Luo <luozhengfeng@h-partners.com>
Reviewed-by: Yangyang Li <liyangyang20@huawei.com>
Reviewed-by: Yue Haibing <yuehaibing@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 drivers/infiniband/hw/hns/hns_roce_device.h |   1 +
 drivers/infiniband/hw/hns/hns_roce_hw_v2.c  | 182 +++++++++++++++++++-
 drivers/infiniband/hw/hns/hns_roce_hw_v2.h  |  13 ++
 3 files changed, 194 insertions(+), 2 deletions(-)
diff --git a/drivers/infiniband/hw/hns/hns_roce_device.h b/drivers/infiniband/hw/hns/hns_roce_device.h index 0d160432fa65..f600475dd1fc 100644 --- a/drivers/infiniband/hw/hns/hns_roce_device.h +++ b/drivers/infiniband/hw/hns/hns_roce_device.h @@ -957,6 +957,7 @@ struct hns_roce_dev { const struct hns_roce_hw *hw; void *priv; struct workqueue_struct *irq_workq; + struct work_struct ecc_work; const struct hns_roce_dfx_hw *dfx; u32 func_num; u32 is_vf; diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c index 8e8a5f69ea09..045dde54974d 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c @@ -55,6 +55,42 @@ enum { CMD_RST_PRC_EBUSY, };
+enum ecc_resource_type { + ECC_RESOURCE_QPC, + ECC_RESOURCE_CQC, + ECC_RESOURCE_MPT, + ECC_RESOURCE_SRQC, + ECC_RESOURCE_GMV, + ECC_RESOURCE_QPC_TIMER, + ECC_RESOURCE_CQC_TIMER, + ECC_RESOURCE_SCCC, + ECC_RESOURCE_COUNT, +}; + +static const struct { + const char *name; + u8 read_bt0_op; + u8 write_bt0_op; +} fmea_ram_res[] = { + { "ECC_RESOURCE_QPC", + HNS_ROCE_CMD_READ_QPC_BT0, HNS_ROCE_CMD_WRITE_QPC_BT0 }, + { "ECC_RESOURCE_CQC", + HNS_ROCE_CMD_READ_CQC_BT0, HNS_ROCE_CMD_WRITE_CQC_BT0 }, + { "ECC_RESOURCE_MPT", + HNS_ROCE_CMD_READ_MPT_BT0, HNS_ROCE_CMD_WRITE_MPT_BT0 }, + { "ECC_RESOURCE_SRQC", + HNS_ROCE_CMD_READ_SRQC_BT0, HNS_ROCE_CMD_WRITE_SRQC_BT0 }, + /* ECC_RESOURCE_GMV is handled by cmdq, not mailbox */ + { "ECC_RESOURCE_GMV", + 0, 0 }, + { "ECC_RESOURCE_QPC_TIMER", + HNS_ROCE_CMD_READ_QPC_TIMER_BT0, HNS_ROCE_CMD_WRITE_QPC_TIMER_BT0 }, + { "ECC_RESOURCE_CQC_TIMER", + HNS_ROCE_CMD_READ_CQC_TIMER_BT0, HNS_ROCE_CMD_WRITE_CQC_TIMER_BT0 }, + { "ECC_RESOURCE_SCCC", + HNS_ROCE_CMD_READ_SCCC_BT0, HNS_ROCE_CMD_WRITE_SCCC_BT0 }, +}; + static inline void set_data_seg_v2(struct hns_roce_v2_wqe_data_seg *dseg, struct ib_sge *sg) { @@ -5663,6 +5699,142 @@ static irqreturn_t abnormal_interrupt_basic(struct hns_roce_dev *hr_dev, return IRQ_RETVAL(int_work); }
+static int fmea_ram_ecc_query(struct hns_roce_dev *hr_dev, + struct fmea_ram_ecc *ecc_info) +{ + struct hns_roce_cmq_desc desc; + struct hns_roce_cmq_req *req = (struct hns_roce_cmq_req *)desc.data; + int ret; + + hns_roce_cmq_setup_basic_desc(&desc, HNS_ROCE_QUERY_RAM_ECC, true); + ret = hns_roce_cmq_send(hr_dev, &desc, 1); + if (ret) + return ret; + + ecc_info->is_ecc_err = hr_reg_read(req, QUERY_RAM_ECC_1BIT_ERR); + ecc_info->res_type = hr_reg_read(req, QUERY_RAM_ECC_RES_TYPE); + ecc_info->index = hr_reg_read(req, QUERY_RAM_ECC_TAG); + + return 0; +} + +static int fmea_recover_gmv(struct hns_roce_dev *hr_dev, u32 idx) +{ + struct hns_roce_cmq_desc desc; + struct hns_roce_cmq_req *req = (struct hns_roce_cmq_req *)desc.data; + u32 addr_upper; + u32 addr_low; + int ret; + + hns_roce_cmq_setup_basic_desc(&desc, HNS_ROCE_OPC_CFG_GMV_BT, true); + hr_reg_write(req, CFG_GMV_BT_IDX, idx); + + ret = hns_roce_cmq_send(hr_dev, &desc, 1); + if (ret) { + dev_err(hr_dev->dev, + "failed to execute cmd to read gmv, ret = %d.\n", ret); + return ret; + } + + addr_low = hr_reg_read(req, CFG_GMV_BT_BA_L); + addr_upper = hr_reg_read(req, CFG_GMV_BT_BA_H); + + hns_roce_cmq_setup_basic_desc(&desc, HNS_ROCE_OPC_CFG_GMV_BT, false); + hr_reg_write(req, CFG_GMV_BT_BA_L, addr_low); + hr_reg_write(req, CFG_GMV_BT_BA_H, addr_upper); + hr_reg_write(req, CFG_GMV_BT_IDX, idx); + + return hns_roce_cmq_send(hr_dev, &desc, 1); +} + +static u64 fmea_get_ram_res_addr(u32 res_type, __le64 *data) +{ + if (res_type == ECC_RESOURCE_QPC_TIMER || + res_type == ECC_RESOURCE_CQC_TIMER || + res_type == ECC_RESOURCE_SCCC) + return le64_to_cpu(*data); + + return le64_to_cpu(*data) << PAGE_SHIFT; +} + +static int fmea_recover_others(struct hns_roce_dev *hr_dev, u32 res_type, + u32 index) +{ + u8 write_bt0_op = fmea_ram_res[res_type].write_bt0_op; + u8 read_bt0_op = fmea_ram_res[res_type].read_bt0_op; + struct hns_roce_cmd_mailbox *mailbox; + u64 addr; + int ret; + + mailbox = hns_roce_alloc_cmd_mailbox(hr_dev); + if (IS_ERR(mailbox)) + return PTR_ERR(mailbox); + + ret = hns_roce_cmd_mbox(hr_dev, 0, mailbox->dma, read_bt0_op, index); + if (ret) { + dev_err(hr_dev->dev, + "failed to execute cmd to read fmea ram, ret = %d.\n", + ret); + goto out; + } + + addr = fmea_get_ram_res_addr(res_type, mailbox->buf); + + ret = hns_roce_cmd_mbox(hr_dev, addr, 0, write_bt0_op, index); + if (ret) + dev_err(hr_dev->dev, + "failed to execute cmd to write fmea ram, ret = %d.\n", + ret); + +out: + hns_roce_free_cmd_mailbox(hr_dev, mailbox); + return ret; +} + +static void fmea_ram_ecc_recover(struct hns_roce_dev *hr_dev, + struct fmea_ram_ecc *ecc_info) +{ + u32 res_type = ecc_info->res_type; + u32 index = ecc_info->index; + int ret; + + BUILD_BUG_ON(ARRAY_SIZE(fmea_ram_res) != ECC_RESOURCE_COUNT); + + if (res_type >= ECC_RESOURCE_COUNT) { + dev_err(hr_dev->dev, "unsupported fmea ram ecc type %u.\n", + res_type); + return; + } + + if (res_type == ECC_RESOURCE_GMV) + ret = fmea_recover_gmv(hr_dev, index); + else + ret = fmea_recover_others(hr_dev, res_type, index); + if (ret) + dev_err(hr_dev->dev, + "failed to recover %s, index = %u, ret = %d.\n", + fmea_ram_res[res_type].name, index, ret); +} + +static void fmea_ram_ecc_work(struct work_struct *ecc_work) +{ + struct hns_roce_dev *hr_dev = + container_of(ecc_work, struct hns_roce_dev, ecc_work); + struct fmea_ram_ecc ecc_info = {}; + + if (fmea_ram_ecc_query(hr_dev, &ecc_info)) { + dev_err(hr_dev->dev, "failed to query fmea ram ecc.\n"); + return; + } + + if (!ecc_info.is_ecc_err) { + dev_err(hr_dev->dev, "there is no fmea ram ecc err found.\n"); + return; + } + + fmea_ram_ecc_recover(hr_dev, &ecc_info); +} + static irqreturn_t hns_roce_v2_msix_interrupt_abn(int irq, void *dev_id) { struct hns_roce_dev *hr_dev = dev_id; @@ -5671,10 +5843,14 @@ static irqreturn_t hns_roce_v2_msix_interrupt_abn(int irq, void *dev_id)
int_st = roce_read(hr_dev, ROCEE_VF_ABN_INT_ST_REG);
- if (int_st) + if (int_st) { int_work = abnormal_interrupt_basic(hr_dev, int_st); - else + } else if (hr_dev->pci_dev->revision >= PCI_REVISION_ID_HIP09) { + queue_work(hr_dev->irq_workq, &hr_dev->ecc_work); + int_work = IRQ_HANDLED; + } else { dev_err(hr_dev->dev, "there is no abnormal irq found.\n"); + }
return IRQ_RETVAL(int_work); } @@ -5981,6 +6157,8 @@ static int hns_roce_v2_init_eq_table(struct hns_roce_dev *hr_dev) } }
+ INIT_WORK(&hr_dev->ecc_work, fmea_ram_ecc_work); + hr_dev->irq_workq = alloc_ordered_workqueue("hns_roce_irq_workq", 0); if (!hr_dev->irq_workq) { dev_err(dev, "failed to create irq workqueue.\n"); diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.h b/drivers/infiniband/hw/hns/hns_roce_hw_v2.h index b30d2ebdbe59..e5df163f56e0 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.h +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.h @@ -238,6 +238,7 @@ enum hns_roce_opcode_type { HNS_ROCE_OPC_CFG_GMV_TBL = 0x850f, HNS_ROCE_OPC_CFG_GMV_BT = 0x8510, HNS_ROCE_OPC_EXT_CFG = 0x8512, + HNS_ROCE_QUERY_RAM_ECC = 0x8513, HNS_SWITCH_PARAMETER_CFG = 0x1033, };
@@ -1095,6 +1096,11 @@ enum { #define CFG_GMV_BT_BA_H CMQ_REQ_FIELD_LOC(51, 32) #define CFG_GMV_BT_IDX CMQ_REQ_FIELD_LOC(95, 64)
+/* Fields of HNS_ROCE_QUERY_RAM_ECC */ +#define QUERY_RAM_ECC_1BIT_ERR CMQ_REQ_FIELD_LOC(31, 0) +#define QUERY_RAM_ECC_RES_TYPE CMQ_REQ_FIELD_LOC(63, 32) +#define QUERY_RAM_ECC_TAG CMQ_REQ_FIELD_LOC(95, 64) + struct hns_roce_cfg_sgid_tb { __le32 table_idx_rsv; __le32 vf_sgid_l; @@ -1323,6 +1329,13 @@ struct hns_roce_dip { struct list_head node; /* all dips are on a list */ };
+struct fmea_ram_ecc { + u32 is_ecc_err; + u32 res_type; + u32 index; +}; + + #define HNS_ROCE_AEQ_DEFAULT_BURST_NUM 0x0 #define HNS_ROCE_AEQ_DEFAULT_INTERVAL 0x0 #define HNS_ROCE_CEQ_DEFAULT_BURST_NUM 0x0
From: Zhihao Cheng chengzhihao1@huawei.com
hulk inclusion category: bugfix bugzilla: 187185, https://gitee.com/openeuler/kernel/issues/I5JAOC CVE: NA
--------------------------------
The following process fails the assertion 'jh->b_frozen_data == NULL' in jbd2_journal_dirty_metadata():
jbd2_journal_commit_transaction
                            unlink(dir/a)
                             jh->b_transaction = trans1
                             jh->b_jlist = BJ_Metadata
 journal->j_running_transaction = NULL
 trans1->t_state = T_COMMIT
                            unlink(dir/b)
                             handle->h_trans = trans2
                             do_get_write_access
                              jh->b_modified = 0
                              jh->b_frozen_data = frozen_buffer
                              jh->b_next_transaction = trans2
                             jbd2_journal_dirty_metadata
                              is_handle_aborted
                               is_journal_aborted  // return false

 --> jbd2 abort <--

 while (commit_transaction->t_buffers)
  if (is_journal_aborted)
   jbd2_journal_refile_buffer
    __jbd2_journal_refile_buffer
     WRITE_ONCE(jh->b_transaction, jh->b_next_transaction)
     WRITE_ONCE(jh->b_next_transaction, NULL)
     __jbd2_journal_file_buffer(jh, BJ_Reserved)
   J_ASSERT_JH(jh, jh->b_frozen_data == NULL)  // assertion failure !
The reproducer (See detail in [Link]) reports:

 ------------[ cut here ]------------
 kernel BUG at fs/jbd2/transaction.c:1629!
 invalid opcode: 0000 [#1] PREEMPT SMP
 CPU: 2 PID: 584 Comm: unlink Tainted: G W 5.19.0-rc6-00115-g4a57a8400075-dirty #697
 RIP: 0010:jbd2_journal_dirty_metadata+0x3c5/0x470
 RSP: 0018:ffffc90000be7ce0 EFLAGS: 00010202
 Call Trace:
  <TASK>
  __ext4_handle_dirty_metadata+0xa0/0x290
  ext4_handle_dirty_dirblock+0x10c/0x1d0
  ext4_delete_entry+0x104/0x200
  __ext4_unlink+0x22b/0x360
  ext4_unlink+0x275/0x390
  vfs_unlink+0x20b/0x4c0
  do_unlinkat+0x42f/0x4c0
  __x64_sys_unlink+0x37/0x50
  do_syscall_64+0x35/0x80
After the journal aborts, __jbd2_journal_refile_buffer() is executed while holding @jh->b_state_lock, so we can fix this by moving the 'is_handle_aborted()' check into the area protected by @jh->b_state_lock.
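As a userspace toy model of that rule (illustrative names, not the jbd2 API), the check only closes the race once it sits inside the same critical section the refile path uses:

  #include <errno.h>
  #include <pthread.h>

  /* Toy model of the fix: the "is the journal aborted?" check is only
   * meaningful when made under the same lock the committing thread
   * holds while refiling the buffer. */
  struct toy_jh {
          pthread_mutex_t b_state_lock;
          int aborted;
  };

  static int toy_dirty_metadata(struct toy_jh *jh)
  {
          int ret = 0;

          pthread_mutex_lock(&jh->b_state_lock);
          if (jh->aborted) {
                  /* Checked under b_state_lock: a concurrent refile
                   * cannot swap b_transaction/b_next_transaction between
                   * this check and the filing below. */
                  ret = -EROFS;
                  goto out_unlock;
          }
          /* ... file the buffer as BJ_Metadata here ... */
  out_unlock:
          pthread_mutex_unlock(&jh->b_state_lock);
          return ret;
  }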
Link: https://bugzilla.kernel.org/show_bug.cgi?id=216251 Fixes: 470decc613ab20 ("[PATCH] jbd2: initial copy of files from jbd") Signed-off-by: Zhihao Cheng chengzhihao1@huawei.com Reviewed-by: Zhang Yi yi.zhang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- fs/jbd2/transaction.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-)
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index 8804e126805f..cefee2dead54 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -1460,8 +1460,6 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh) struct journal_head *jh; int ret = 0;
- if (is_handle_aborted(handle)) - return -EROFS; if (!buffer_jbd(bh)) return -EUCLEAN;
@@ -1508,6 +1506,18 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh) journal = transaction->t_journal; spin_lock(&jh->b_state_lock);
+ if (is_handle_aborted(handle)) { + /* + * Check journal aborting with @jh->b_state_lock locked, + * since 'jh->b_transaction' could be replaced with + * 'jh->b_next_transaction' during old transaction + * committing if journal aborted, which may fail + * assertion on 'jh->b_frozen_data == NULL'. + */ + ret = -EROFS; + goto out_unlock_bh; + } + if (jh->b_modified == 0) { /* * This buffer's got modified and becoming part
From: Pavel Skripkin paskripkin@gmail.com
mainline inclusion from mainline-v5.19-rc8 commit 0ac4827f78c7ffe8eef074bc010e7e34bc22f533 category: bugfix bugzilla: https://gitee.com/src-openeuler/kernel/issues/I58DFB CVE: CVE-2022-1679
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
Syzbot reported a use-after-free read in ath9k_hif_usb_rx_cb() [0]. The problem was the premature initialization of htc_handle->drv_priv.
A probable call trace that can trigger the use-after-free:

ath9k_htc_probe_device()
  /* htc_handle->drv_priv = priv; */
  ath9k_htc_wait_for_target()      <--- Failed
  ieee80211_free_hw()              <--- priv pointer is freed

<IRQ>
...
ath9k_hif_usb_rx_cb()
  ath9k_hif_usb_rx_stream()
    RX_STAT_INC()                  <--- htc_handle->drv_priv access
In order not to add fancy protection for drv_priv, we can move the htc_handle->drv_priv initialization to the end of ath9k_htc_probe_device() and add a helper macro to make all *_STAT_* macros NULL-safe, since syzbot has reported a related NULL dereference in those macros [1].
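A toy model of the NULL-safe macro idea (illustrative names, not the actual ath9k definitions, which appear in the diff below):

  /* Toy model: guard every stats access behind a drv_priv NULL check,
   * so URB completions that arrive before probe publishes drv_priv are
   * simply not counted. */
  struct toy_stats { unsigned long skb_completed; };
  struct toy_priv { struct toy_stats skbrx_stats; };
  struct toy_htc { struct toy_priv *drv_priv; };
  struct toy_hif { struct toy_htc *htc_handle; };

  #define TOY_STAT_SAFE(hif_dev, expr) \
          ((hif_dev)->htc_handle->drv_priv ? (expr) : 0)
  #define TOY_RX_STAT_INC(hif_dev, c) \
          TOY_STAT_SAFE(hif_dev, \
                        (hif_dev)->htc_handle->drv_priv->skbrx_stats.c++)

The ternary evaluates drv_priv before the stats expression, so the dereference never happens on a NULL pointer; combined with publishing drv_priv only after the target handshake succeeds, this closes the window.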
Link: https://syzkaller.appspot.com/bug?id=6ead44e37afb6866ac0c7dd121b4ce07cb665f6... [0] Link: https://syzkaller.appspot.com/bug?id=b8101ffcec107c0567a0cd8acbbacec91e9ee8d... [1] Fixes: fb9987d0f748 ("ath9k_htc: Support for AR9271 chipset.") Reported-and-tested-by: syzbot+03110230a11411024147@syzkaller.appspotmail.com Reported-and-tested-by: syzbot+c6dde1f690b60e0b9fbe@syzkaller.appspotmail.com Signed-off-by: Pavel Skripkin paskripkin@gmail.com Acked-by: Toke Høiland-Jørgensen toke@toke.dk Signed-off-by: Kalle Valo quic_kvalo@quicinc.com Link: https://lore.kernel.org/r/d57bbedc857950659bfacac0ab48790c1eda00c8.165514574... Signed-off-by: Xu Jia xujia39@huawei.com Reviewed-by: Yue Haibing yuehaibing@huawei.com Reviewed-by: Wei Yongjun weiyongjun1@huawei.com Reviewed-by: Wang Weiyang wangweiyang2@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- drivers/net/wireless/ath/ath9k/htc.h | 10 +++++----- drivers/net/wireless/ath/ath9k/htc_drv_init.c | 3 ++- 2 files changed, 7 insertions(+), 6 deletions(-)
diff --git a/drivers/net/wireless/ath/ath9k/htc.h b/drivers/net/wireless/ath/ath9k/htc.h index 0a1634238e67..b25670f9d859 100644 --- a/drivers/net/wireless/ath/ath9k/htc.h +++ b/drivers/net/wireless/ath/ath9k/htc.h @@ -325,11 +325,11 @@ static inline struct ath9k_htc_tx_ctl *HTC_SKB_CB(struct sk_buff *skb) }
#ifdef CONFIG_ATH9K_HTC_DEBUGFS - -#define TX_STAT_INC(c) (hif_dev->htc_handle->drv_priv->debug.tx_stats.c++) -#define TX_STAT_ADD(c, a) (hif_dev->htc_handle->drv_priv->debug.tx_stats.c += a) -#define RX_STAT_INC(c) (hif_dev->htc_handle->drv_priv->debug.skbrx_stats.c++) -#define RX_STAT_ADD(c, a) (hif_dev->htc_handle->drv_priv->debug.skbrx_stats.c += a) +#define __STAT_SAFE(expr) (hif_dev->htc_handle->drv_priv ? (expr) : 0) +#define TX_STAT_INC(c) __STAT_SAFE(hif_dev->htc_handle->drv_priv->debug.tx_stats.c++) +#define TX_STAT_ADD(c, a) __STAT_SAFE(hif_dev->htc_handle->drv_priv->debug.tx_stats.c += a) +#define RX_STAT_INC(c) __STAT_SAFE(hif_dev->htc_handle->drv_priv->debug.skbrx_stats.c++) +#define RX_STAT_ADD(c, a) __STAT_SAFE(hif_dev->htc_handle->drv_priv->debug.skbrx_stats.c += a) #define CAB_STAT_INC priv->debug.tx_stats.cab_queued++
#define TX_QSTAT_INC(q) (priv->debug.tx_stats.queue_stats[q]++) diff --git a/drivers/net/wireless/ath/ath9k/htc_drv_init.c b/drivers/net/wireless/ath/ath9k/htc_drv_init.c index ff61ae34ecdf..07ac88fb1c57 100644 --- a/drivers/net/wireless/ath/ath9k/htc_drv_init.c +++ b/drivers/net/wireless/ath/ath9k/htc_drv_init.c @@ -944,7 +944,6 @@ int ath9k_htc_probe_device(struct htc_target *htc_handle, struct device *dev, priv->hw = hw; priv->htc = htc_handle; priv->dev = dev; - htc_handle->drv_priv = priv; SET_IEEE80211_DEV(hw, priv->dev);
ret = ath9k_htc_wait_for_target(priv); @@ -965,6 +964,8 @@ int ath9k_htc_probe_device(struct htc_target *htc_handle, struct device *dev, if (ret) goto err_init;
+ htc_handle->drv_priv = priv; + return 0;
err_init:
From: Hangyu Hua hbh25y@gmail.com
stable inclusion from stable-v5.10.134 commit 47b696dd654450cdec3103a833e5bf29c4b83bfa category: bugfix bugzilla: https://gitee.com/src-openeuler/kernel/issues/I5JMHA CVE: CVE-2022-36879
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?h=l...
--------------------------------
[ Upstream commit f85daf0e725358be78dfd208dea5fd665d8cb901 ]
xfrm_policy_lookup() will call xfrm_pol_hold_rcu() to take a reference on pols[0]. This reference can be dropped in xfrm_expand_policies() when xfrm_expand_policies() returns an error, which balances pols[0]'s refcount there. But xfrm_bundle_lookup() will also call xfrm_pols_put() with num_pols == 1 to drop this reference when xfrm_expand_policies() returns an error.
This patch also fixes an illegal address access: pols[0] holds an error pointer when xfrm_policy_lookup() fails, which leads xfrm_pols_put() to dereference an illegal address in xfrm_bundle_lookup()'s error path.
Fix these by setting num_pols = 0 in xfrm_expand_policies()'s error path.
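A toy model of the ownership contract the fix restores (illustrative names, not the xfrm API): on return, *num must describe exactly how many references the caller still owns, including on error paths.

  #include <errno.h>

  struct toy_pol { int refcnt; };

  static void toy_put(struct toy_pol **pols, int num)
  {
          while (num-- > 0)
                  pols[num]->refcnt--;
  }

  static int toy_expand(struct toy_pol **pols, int *num, int fail)
  {
          *num = 1;                       /* one reference handed to caller */

          if (fail) {
                  toy_put(pols, *num);    /* dropped locally ...            */
                  *num = 0;               /* ... so the caller must not put */
                  return -EINVAL;
          }
          return 0;
  }

With *num zeroed on failure, a caller that unconditionally runs toy_put(pols, *num) in its error path can neither double-drop the reference nor touch an error pointer stored in pols[0].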
Fixes: 80c802f3073e ("xfrm: cache bundles instead of policies for outgoing flows") Signed-off-by: Hangyu Hua hbh25y@gmail.com Signed-off-by: Steffen Klassert steffen.klassert@secunet.com Signed-off-by: Sasha Levin sashal@kernel.org Signed-off-by: Zhengchao Shao shaozhengchao@huawei.com Reviewed-by: Xiu Jianfeng xiujianfeng@huawei.com Reviewed-by: Wei Yongjun weiyongjun1@huawei.com Reviewed-by: Yue Haibing yuehaibing@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- net/xfrm/xfrm_policy.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-)
diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index 3d0ffd927004..01a783fccf76 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -2680,8 +2680,10 @@ static int xfrm_expand_policies(const struct flowi *fl, u16 family, *num_xfrms = 0; return 0; } - if (IS_ERR(pols[0])) + if (IS_ERR(pols[0])) { + *num_pols = 0; return PTR_ERR(pols[0]); + }
*num_xfrms = pols[0]->xfrm_nr;
@@ -2696,6 +2698,7 @@ static int xfrm_expand_policies(const struct flowi *fl, u16 family, if (pols[1]) { if (IS_ERR(pols[1])) { xfrm_pols_put(pols, *num_pols); + *num_pols = 0; return PTR_ERR(pols[1]); } (*num_pols)++;
From: Ziyang Xuan william.xuanziyang@huawei.com
mainline inclusion from mainline-v5.19 commit 85f0173df35e5462d89947135a6a5599c6c3ef6f category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5JW9C?from=project-issue CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=...
-------------------------------
Changing a net device's MTU to a value smaller than IPV6_MIN_MTU, or unregistering the device, while a route is being matched may occasionally trigger a null-ptr-deref bug on ip6_ptr, as follows.
=========================================================
BUG: KASAN: null-ptr-deref in find_match.part.0+0x70/0x134
Read of size 4 at addr 0000000000000308 by task ping6/263

CPU: 2 PID: 263 Comm: ping6 Not tainted 5.19.0-rc7+ #14
Call trace:
 dump_backtrace+0x1a8/0x230
 show_stack+0x20/0x70
 dump_stack_lvl+0x68/0x84
 print_report+0xc4/0x120
 kasan_report+0x84/0x120
 __asan_load4+0x94/0xd0
 find_match.part.0+0x70/0x134
 __find_rr_leaf+0x408/0x470
 fib6_table_lookup+0x264/0x540
 ip6_pol_route+0xf4/0x260
 ip6_pol_route_output+0x58/0x70
 fib6_rule_lookup+0x1a8/0x330
 ip6_route_output_flags_noref+0xd8/0x1a0
 ip6_route_output_flags+0x58/0x160
 ip6_dst_lookup_tail+0x5b4/0x85c
 ip6_dst_lookup_flow+0x98/0x120
 rawv6_sendmsg+0x49c/0xc70
 inet_sendmsg+0x68/0x94
Reproducer:

First, prepare the environment:
$ip netns add ns1
$ip netns add ns2
$ip link add veth1 type veth peer name veth2
$ip link set veth1 netns ns1
$ip link set veth2 netns ns2
$ip netns exec ns1 ip -6 addr add 2001:0db8:0:f101::1/64 dev veth1
$ip netns exec ns2 ip -6 addr add 2001:0db8:0:f101::2/64 dev veth2
$ip netns exec ns1 ifconfig veth1 up
$ip netns exec ns2 ifconfig veth2 up
$ip netns exec ns1 ip -6 route add 2000::/64 dev veth1 metric 1
$ip netns exec ns2 ip -6 route add 2001::/64 dev veth2 metric 1

Secondly, execute the following two commands in two ssh windows respectively:

$ip netns exec ns1 sh
$while true; do ip -6 addr add 2001:0db8:0:f101::1/64 dev veth1; ip -6 route add 2000::/64 dev veth1 metric 1; ping6 2000::2; done

$ip netns exec ns1 sh
$while true; do ip link set veth1 mtu 1000; ip link set veth1 mtu 1500; sleep 5; done
This happens because ip6_ptr is first set to NULL in addrconf_ifdown(), and ip6_ignore_linkdown() then accesses ip6_ptr directly without a NULL check.
        cpu0                            cpu1
fib6_table_lookup
 __find_rr_leaf
                                addrconf_notify [ NETDEV_CHANGEMTU ]
                                 addrconf_ifdown
                                  RCU_INIT_POINTER(dev->ip6_ptr, NULL)
 find_match
  ip6_ignore_linkdown
So add a NULL check for ip6_ptr before using it in ip6_ignore_linkdown() to fix the null-ptr-deref bug.
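A condensed model of the resulting reader-side rule (stubbed types, not the kernel structures; the real hunk is in the diff below): a pointer that a concurrent teardown path publishes as NULL must be re-checked after every fetch, and "treat the device as ignoring linkdown" is the fail-safe answer the patch picks.

  #include <stdbool.h>

  /* Toy model: dev->ip6_ptr may be set to NULL by a concurrent
   * addrconf_ifdown() (RCU_INIT_POINTER(dev->ip6_ptr, NULL)). */
  struct toy_idev { bool ignore_routes_with_linkdown; };
  struct toy_dev { struct toy_idev *ip6_ptr; };

  static bool toy_ignore_linkdown(const struct toy_dev *dev)
  {
          const struct toy_idev *idev = dev->ip6_ptr; /* may be NULL */

          if (!idev)           /* device is being torn down */
                  return true; /* fail safe: skip linkdown filtering */

          return idev->ignore_routes_with_linkdown;
  }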
Fixes: dcd1f572954f ("net/ipv6: Remove fib6_idev") Signed-off-by: Ziyang Xuan william.xuanziyang@huawei.com Reviewed-by: David Ahern dsahern@kernel.org Link: https://lore.kernel.org/r/20220728013307.656257-1-william.xuanziyang@huawei.... Signed-off-by: Jakub Kicinski kuba@kernel.org Signed-off-by: Ziyang Xuan william.xuanziyang@huawei.com Reviewed-by: Yue Haibing yuehaibing@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- include/net/addrconf.h | 3 +++ 1 file changed, 3 insertions(+)
diff --git a/include/net/addrconf.h b/include/net/addrconf.h index e7ce719838b5..edba74a53683 100644 --- a/include/net/addrconf.h +++ b/include/net/addrconf.h @@ -405,6 +405,9 @@ static inline bool ip6_ignore_linkdown(const struct net_device *dev) { const struct inet6_dev *idev = __in6_dev_get(dev);
+ if (unlikely(!idev)) + return true; + return !!idev->cnf.ignore_routes_with_linkdown; }