From: Yufen Yu yuyufen@huawei.com
hulk inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4JYYO?from=project-issue CVE: NA
---------------------------
We get a NULL pointer dereference oops when test raid1 as follow:
mdadm -CR /dev/md1 -l 1 -n 2 /dev/sd[ab]
mdadm /dev/md1 -f /dev/sda mdadm /dev/md1 -r /dev/sda mdadm /dev/md1 -a /dev/sda sleep 5 mdadm /dev/md1 -f /dev/sdb mdadm /dev/md1 -r /dev/sdb mdadm /dev/md1 -a /dev/sdb
After a disk(/dev/sda) has been removed, we add the disk to raid array again, which would trigger recovery action. Since the rdev current state is 'spare', read/write bio can be issued to the disk.
Then we set the other disk (/dev/sdb) faulty. Since the raid array is now in degraded state and /dev/sdb is the only 'In_sync' disk, raid1_error() will return but without set faulty success.
However, that can interrupt the recovery action and md_check_recovery will try to call remove_and_add_spares() to remove the spare disk. And the race condition between remove_and_add_spares() and raid1_write_request() in follow can cause NULL pointer dereference for conf->mirrors[i].rdev:
raid1_write_request() md_check_recovery raid1_error() rcu_read_lock() rdev != NULL !test_bit(Faulty, &rdev->flags)
conf->recovery_disabled= mddev->recovery_disabled; return busy
remove_and_add_spares raid1_remove_disk rdev->nr_pending == 0
atomic_inc(&rdev->nr_pending); rcu_read_unlock()
p->rdev=NULL
conf->mirrors[i].rdev->data_offset NULL pointer deref!!!
if (!test_bit(RemoveSynchronized, &rdev->flags)) synchronize_rcu(); p->rdev=rdev
To fix the race condition, we add a new flag 'WantRemove' for rdev. Before access conf->mirrors[i].rdev, we need to ensure the rdev without 'WantRemove' bit.
Link: https://marc.info/?l=linux-raid&m=156412052717709&w=2
Reported-by: Zou Wei zou_wei@huawei.com Signed-off-by: Yufen Yu yuyufen@huawei.com Confilct: drivers/md/md.h Signed-off-by: Laibin Qiu qiulaibin@huawei.com Reviewed-by: yuyufen yuyufen@huawei.com Reviewed-by: Jason Yan yanaijie@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- drivers/md/md.h | 4 ++++ drivers/md/raid1.c | 28 ++++++++++++++++++++++------ 2 files changed, 26 insertions(+), 6 deletions(-)
diff --git a/drivers/md/md.h b/drivers/md/md.h index c94811cf2600..766ecfb0ff5c 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -213,6 +213,10 @@ enum flag_bits { * check if there is collision between raid1 * serial bios. */ + WantRemove, /* Before set conf->mirrors[i] as NULL, + * we set the bit first, avoiding access the + * conf->mirrors[i] after it set NULL. + */ };
static inline int is_badblock(struct md_rdev *rdev, sector_t s, int sectors, diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index fb31e5dd54a6..da6772f49f07 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -641,7 +641,8 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect rdev = rcu_dereference(conf->mirrors[disk].rdev); if (r1_bio->bios[disk] == IO_BLOCKED || rdev == NULL - || test_bit(Faulty, &rdev->flags)) + || test_bit(Faulty, &rdev->flags) + || test_bit(WantRemove, &rdev->flags)) continue; if (!test_bit(In_sync, &rdev->flags) && rdev->recovery_offset < this_sector + sectors) @@ -770,7 +771,8 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
if (best_disk >= 0) { rdev = rcu_dereference(conf->mirrors[best_disk].rdev); - if (!rdev) + if (!rdev || test_bit(Faulty, &rdev->flags) + || test_bit(WantRemove, &rdev->flags)) goto retry; atomic_inc(&rdev->nr_pending); sectors = best_good_sectors; @@ -1382,7 +1384,8 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, break; } r1_bio->bios[i] = NULL; - if (!rdev || test_bit(Faulty, &rdev->flags)) { + if (!rdev || test_bit(Faulty, &rdev->flags) + || test_bit(WantRemove, &rdev->flags)) { if (i < conf->raid_disks) set_bit(R1BIO_Degraded, &r1_bio->state); continue; @@ -1759,6 +1762,7 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev)
p->head_position = 0; rdev->raid_disk = mirror; + clear_bit(WantRemove, &rdev->flags); err = 0; /* As all devices are equivalent, we don't need a full recovery * if this was recently any drive of the array @@ -1773,6 +1777,7 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev) /* Add this device as a replacement */ clear_bit(In_sync, &rdev->flags); set_bit(Replacement, &rdev->flags); + clear_bit(WantRemove, &rdev->flags); rdev->raid_disk = mirror; err = 0; conf->fullsync = 1; @@ -1812,16 +1817,26 @@ static int raid1_remove_disk(struct mddev *mddev, struct md_rdev *rdev) err = -EBUSY; goto abort; } - p->rdev = NULL; + + /* + * Before set p->rdev = NULL, we set WantRemove bit avoiding + * race between rdev remove and issue bio, which can cause + * NULL pointer deference of rdev by conf->mirrors[i].rdev. + */ + set_bit(WantRemove, &rdev->flags); + if (!test_bit(RemoveSynchronized, &rdev->flags)) { synchronize_rcu(); if (atomic_read(&rdev->nr_pending)) { /* lost the race, try later */ err = -EBUSY; - p->rdev = rdev; + clear_bit(WantRemove, &rdev->flags); goto abort; } } + + p->rdev = NULL; + if (conf->mirrors[conf->raid_disks + number].rdev) { /* We just removed a device that is being replaced. * Move down the replacement. We drain all IO before @@ -2716,7 +2731,8 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr,
rdev = rcu_dereference(conf->mirrors[i].rdev); if (rdev == NULL || - test_bit(Faulty, &rdev->flags)) { + test_bit(Faulty, &rdev->flags) || + test_bit(WantRemove, &rdev->flags)) { if (i < conf->raid_disks) still_degraded = 1; } else if (!test_bit(In_sync, &rdev->flags)) {