[openEuler-5.10 11/19] md/raid1: fix a race between removing rdev and access conf->mirrors[i].rdev

23 Dec 2021

From: Yufen Yu <yuyufen@huawei.com>

hulk inclusion
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I4JYYO?from=project-issue
CVE: NA

---------------------------

We get a NULL pointer dereference oops when test raid1 as follow:

mdadm -CR /dev/md1 -l 1 -n 2 /dev/sd[ab]

mdadm /dev/md1 -f /dev/sda
mdadm /dev/md1 -r /dev/sda
mdadm /dev/md1 -a /dev/sda
sleep 5
mdadm /dev/md1 -f /dev/sdb
mdadm /dev/md1 -r /dev/sdb
mdadm /dev/md1 -a /dev/sdb

After a disk(/dev/sda) has been removed, we add the disk to
raid array again, which would trigger recovery action.
Since the rdev current state is 'spare', read/write bio can
be issued to the disk.

Then we set the other disk (/dev/sdb) faulty. Since the raid
array is now in degraded state and /dev/sdb is the only
'In_sync' disk, raid1_error() will return but without set
faulty success.

However, that can interrupt the recovery action and
md_check_recovery will try to call remove_and_add_spares()
to remove the spare disk. And the race condition between
remove_and_add_spares() and raid1_write_request() in follow
can cause NULL pointer dereference for conf->mirrors[i].rdev:

raid1_write_request()   md_check_recovery    raid1_error()
rcu_read_lock()
rdev != NULL
!test_bit(Faulty, &rdev->flags)

                                           conf->recovery_disabled=
                                             mddev->recovery_disabled;
                                            return busy

                        remove_and_add_spares
                        raid1_remove_disk
                        rdev->nr_pending == 0

atomic_inc(&rdev->nr_pending);
rcu_read_unlock()

                        p->rdev=NULL

conf->mirrors[i].rdev->data_offset
NULL pointer deref!!!

                        if (!test_bit(RemoveSynchronized,
                          &rdev->flags))
                         synchronize_rcu();
                         p->rdev=rdev

To fix the race condition, we add a new flag 'WantRemove' for rdev.
Before access conf->mirrors[i].rdev, we need to ensure the rdev
without 'WantRemove' bit.

Link: https://marc.info/?l=linux-raid&m=156412052717709&w=2

Reported-by: Zou Wei <zou_wei@huawei.com>
Signed-off-by: Yufen Yu <yuyufen@huawei.com>
Confilct:
        drivers/md/md.h
Signed-off-by: Laibin Qiu <qiulaibin@huawei.com>
Reviewed-by: yuyufen <yuyufen@huawei.com>
Reviewed-by: Jason Yan <yanaijie@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 drivers/md/md.h    |  4 ++++
 drivers/md/raid1.c | 28 ++++++++++++++++++++++------
 2 files changed, 26 insertions(+), 6 deletions(-)

diff --git a/drivers/md/md.h b/drivers/md/md.h
index c94811cf2600..766ecfb0ff5c 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -213,6 +213,10 @@ enum flag_bits {
 				 * check if there is collision between raid1
 				 * serial bios.
 				 */
+	WantRemove,		/* Before set conf->mirrors[i] as NULL,
+				 * we set the bit first, avoiding access the
+				 * conf->mirrors[i] after it set NULL.
+				 */
 };
 
 static inline int is_badblock(struct md_rdev *rdev, sector_t s, int sectors,
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index fb31e5dd54a6..da6772f49f07 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -641,7 +641,8 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
 		rdev = rcu_dereference(conf->mirrors[disk].rdev);
 		if (r1_bio->bios[disk] == IO_BLOCKED
 		    || rdev == NULL
-		    || test_bit(Faulty, &rdev->flags))
+		    || test_bit(Faulty, &rdev->flags)
+			|| test_bit(WantRemove, &rdev->flags))
 			continue;
 		if (!test_bit(In_sync, &rdev->flags) &&
 		    rdev->recovery_offset < this_sector + sectors)
@@ -770,7 +771,8 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
 
 	if (best_disk >= 0) {
 		rdev = rcu_dereference(conf->mirrors[best_disk].rdev);
-		if (!rdev)
+		if (!rdev || test_bit(Faulty, &rdev->flags)
+				|| test_bit(WantRemove, &rdev->flags))
 			goto retry;
 		atomic_inc(&rdev->nr_pending);
 		sectors = best_good_sectors;
@@ -1382,7 +1384,8 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
 			break;
 		}
 		r1_bio->bios[i] = NULL;
-		if (!rdev || test_bit(Faulty, &rdev->flags)) {
+		if (!rdev || test_bit(Faulty, &rdev->flags)
+				|| test_bit(WantRemove, &rdev->flags)) {
 			if (i < conf->raid_disks)
 				set_bit(R1BIO_Degraded, &r1_bio->state);
 			continue;
@@ -1759,6 +1762,7 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev)
 
 			p->head_position = 0;
 			rdev->raid_disk = mirror;
+			clear_bit(WantRemove, &rdev->flags);
 			err = 0;
 			/* As all devices are equivalent, we don't need a full recovery
 			 * if this was recently any drive of the array
@@ -1773,6 +1777,7 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev)
 			/* Add this device as a replacement */
 			clear_bit(In_sync, &rdev->flags);
 			set_bit(Replacement, &rdev->flags);
+			clear_bit(WantRemove, &rdev->flags);
 			rdev->raid_disk = mirror;
 			err = 0;
 			conf->fullsync = 1;
@@ -1812,16 +1817,26 @@ static int raid1_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
 			err = -EBUSY;
 			goto abort;
 		}
-		p->rdev = NULL;
+
+		/*
+		 * Before set p->rdev = NULL, we set WantRemove bit avoiding
+		 * race between rdev remove and issue bio, which can cause
+		 * NULL pointer deference of rdev by conf->mirrors[i].rdev.
+		 */
+		set_bit(WantRemove, &rdev->flags);
+
 		if (!test_bit(RemoveSynchronized, &rdev->flags)) {
 			synchronize_rcu();
 			if (atomic_read(&rdev->nr_pending)) {
 				/* lost the race, try later */
 				err = -EBUSY;
-				p->rdev = rdev;
+				clear_bit(WantRemove, &rdev->flags);
 				goto abort;
 			}
 		}
+
+		p->rdev = NULL;
+
 		if (conf->mirrors[conf->raid_disks + number].rdev) {
 			/* We just removed a device that is being replaced.
 			 * Move down the replacement.  We drain all IO before
@@ -2716,7 +2731,8 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr,
 
 		rdev = rcu_dereference(conf->mirrors[i].rdev);
 		if (rdev == NULL ||
-		    test_bit(Faulty, &rdev->flags)) {
+		    test_bit(Faulty, &rdev->flags) ||
+			test_bit(WantRemove, &rdev->flags)) {
 			if (i < conf->raid_disks)
 				still_degraded = 1;
 		} else if (!test_bit(In_sync, &rdev->flags)) {
-- 
2.20.1