From: Xiang Chen chenxiang66@hisilicon.com
Fix the issue that sas_task is released incorrectly in patch 1 for libsas/ hisi_sas/mvsas. And use bool for sas_discover_event() return code in patch 2.
Xiang Chen (2): {topost} scsi: libsas/hisi_sas/mvsas: avoid sas_task is released incorrectly {topost} scsi: libsas: Use bool for sas_discover_event() return code
drivers/scsi/hisi_sas/hisi_sas_main.c | 3 ++- drivers/scsi/libsas/sas_discover.c | 6 ++---- drivers/scsi/libsas/sas_expander.c | 3 ++- drivers/scsi/mvsas/mv_sas.c | 3 ++- include/scsi/libsas.h | 2 +- 5 files changed, 9 insertions(+), 8 deletions(-)
From: Xiang Chen chenxiang66@hisilicon.com
For some situations that retries three times for internal IOs, sas_task is already released, but sas_free_task() will still be called again. It is possible that sas_task is released incorrectly as sas_task may be allocated for other IO before releasing it again.
Signed-off-by: Xiang Chen chenxiang66@hisilicon.com --- drivers/scsi/hisi_sas/hisi_sas_main.c | 3 ++- drivers/scsi/libsas/sas_expander.c | 3 ++- drivers/scsi/mvsas/mv_sas.c | 3 ++- 3 files changed, 6 insertions(+), 3 deletions(-)
diff --git a/drivers/scsi/hisi_sas/hisi_sas_main.c b/drivers/scsi/hisi_sas/hisi_sas_main.c index 9792b2628933..6c693810622b 100644 --- a/drivers/scsi/hisi_sas/hisi_sas_main.c +++ b/drivers/scsi/hisi_sas/hisi_sas_main.c @@ -1354,7 +1354,8 @@ static int hisi_sas_exec_internal_tmf_task(struct domain_device *device, ex_err: if (retry == TASK_RETRY) dev_warn(dev, "abort tmf: executing internal task failed!\n"); - sas_free_task(task); + else + sas_free_task(task); return res; }
diff --git a/drivers/scsi/libsas/sas_expander.c b/drivers/scsi/libsas/sas_expander.c index 6425b9fd99ee..dd1d26e96e16 100644 --- a/drivers/scsi/libsas/sas_expander.c +++ b/drivers/scsi/libsas/sas_expander.c @@ -136,7 +136,8 @@ static int smp_execute_task_sg(struct domain_device *dev, pm_runtime_put_sync(ha->dev);
BUG_ON(retry == 3 && task != NULL); - sas_free_task(task); + if (retry != 3) + sas_free_task(task); return res; }
diff --git a/drivers/scsi/mvsas/mv_sas.c b/drivers/scsi/mvsas/mv_sas.c index 1e52bc7febfa..d7200422ac96 100644 --- a/drivers/scsi/mvsas/mv_sas.c +++ b/drivers/scsi/mvsas/mv_sas.c @@ -1350,7 +1350,8 @@ static int mvs_exec_internal_tmf_task(struct domain_device *dev, } ex_err: BUG_ON(retry == 3 && task != NULL); - sas_free_task(task); + if (retry != 3) + sas_free_task(task); return res; }
On 23/11/2021 02:48, chenxiang wrote:
From: Xiang Chen chenxiang66@hisilicon.com
For some situations that retries three times for internal IOs,
Sorry to say, but I find this hard to read.
sas_task is already released, but sas_free_task() will still be called again.
What are these situations?
Thanks, John
It is possible that sas_task is released incorrectly as sas_task may be allocated for other IO before releasing it again.
Signed-off-by: Xiang Chen chenxiang66@hisilicon.com
drivers/scsi/hisi_sas/hisi_sas_main.c | 3 ++- drivers/scsi/libsas/sas_expander.c | 3 ++- drivers/scsi/mvsas/mv_sas.c | 3 ++- 3 files changed, 6 insertions(+), 3 deletions(-)
diff --git a/drivers/scsi/hisi_sas/hisi_sas_main.c b/drivers/scsi/hisi_sas/hisi_sas_main.c index 9792b2628933..6c693810622b 100644 --- a/drivers/scsi/hisi_sas/hisi_sas_main.c +++ b/drivers/scsi/hisi_sas/hisi_sas_main.c @@ -1354,7 +1354,8 @@ static int hisi_sas_exec_internal_tmf_task(struct domain_device *device, ex_err: if (retry == TASK_RETRY) dev_warn(dev, "abort tmf: executing internal task failed!\n");
- sas_free_task(task);
- else
return res; }sas_free_task(task);
diff --git a/drivers/scsi/libsas/sas_expander.c b/drivers/scsi/libsas/sas_expander.c index 6425b9fd99ee..dd1d26e96e16 100644 --- a/drivers/scsi/libsas/sas_expander.c +++ b/drivers/scsi/libsas/sas_expander.c @@ -136,7 +136,8 @@ static int smp_execute_task_sg(struct domain_device *dev, pm_runtime_put_sync(ha->dev);
BUG_ON(retry == 3 && task != NULL);
- sas_free_task(task);
- if (retry != 3)
return res; }sas_free_task(task);
diff --git a/drivers/scsi/mvsas/mv_sas.c b/drivers/scsi/mvsas/mv_sas.c index 1e52bc7febfa..d7200422ac96 100644 --- a/drivers/scsi/mvsas/mv_sas.c +++ b/drivers/scsi/mvsas/mv_sas.c @@ -1350,7 +1350,8 @@ static int mvs_exec_internal_tmf_task(struct domain_device *dev, } ex_err: BUG_ON(retry == 3 && task != NULL);
- sas_free_task(task);
- if (retry != 3)
return res; }sas_free_task(task);
在 2021/11/23 19:36, John Garry 写道:
On 23/11/2021 02:48, chenxiang wrote:
From: Xiang Chen chenxiang66@hisilicon.com
For some situations that retries three times for internal IOs,
Sorry to say, but I find this hard to read.
sas_task is already released, but sas_free_task() will still be called again.
What are these situations?
For tmf IO, it is failed with stat = SAS_OPEN_REJECT and resp = SAS_TASK_COMPLETE, then it will retry three times. Eevey time sas_task is allocated at the beginning and freed at the end in the loop. But it will free the sas_task again (outside the loop) at the end of the function. But if freed sas_task is allocated by other IO before freeing sas_task at the end of the function, it frees other IO's sas_task actually which will cause memory issue.
thread 1 thread 2 allocate task0 free task0 allocate task0 as task0 is freed already by thread 1 free task0
Thanks, John
It is possible that sas_task is released incorrectly as sas_task may be allocated for other IO before releasing it again.
Signed-off-by: Xiang Chen chenxiang66@hisilicon.com
drivers/scsi/hisi_sas/hisi_sas_main.c | 3 ++- drivers/scsi/libsas/sas_expander.c | 3 ++- drivers/scsi/mvsas/mv_sas.c | 3 ++- 3 files changed, 6 insertions(+), 3 deletions(-)
diff --git a/drivers/scsi/hisi_sas/hisi_sas_main.c b/drivers/scsi/hisi_sas/hisi_sas_main.c index 9792b2628933..6c693810622b 100644 --- a/drivers/scsi/hisi_sas/hisi_sas_main.c +++ b/drivers/scsi/hisi_sas/hisi_sas_main.c @@ -1354,7 +1354,8 @@ static int hisi_sas_exec_internal_tmf_task(struct domain_device *device, ex_err: if (retry == TASK_RETRY) dev_warn(dev, "abort tmf: executing internal task failed!\n");
- sas_free_task(task);
- else
} diff --git a/drivers/scsi/libsas/sas_expander.csas_free_task(task); return res;
b/drivers/scsi/libsas/sas_expander.c index 6425b9fd99ee..dd1d26e96e16 100644 --- a/drivers/scsi/libsas/sas_expander.c +++ b/drivers/scsi/libsas/sas_expander.c @@ -136,7 +136,8 @@ static int smp_execute_task_sg(struct domain_device *dev, pm_runtime_put_sync(ha->dev); BUG_ON(retry == 3 && task != NULL);
- sas_free_task(task);
- if (retry != 3)
} diff --git a/drivers/scsi/mvsas/mv_sas.c b/drivers/scsi/mvsas/mv_sas.csas_free_task(task); return res;
index 1e52bc7febfa..d7200422ac96 100644 --- a/drivers/scsi/mvsas/mv_sas.c +++ b/drivers/scsi/mvsas/mv_sas.c @@ -1350,7 +1350,8 @@ static int mvs_exec_internal_tmf_task(struct domain_device *dev, } ex_err: BUG_ON(retry == 3 && task != NULL);
- sas_free_task(task);
- if (retry != 3)
}sas_free_task(task); return res;
.
On 23/11/2021 12:04, chenxiang (M) wrote:
ry to say, but I find this hard to read.
sas_task is already released, but sas_free_task() will still be called again.
What are these situations?
For tmf IO, it is failed with stat = SAS_OPEN_REJECT and resp = SAS_TASK_COMPLETE, then it will retry three times. Eevey time sas_task is allocated at the beginning and freed at the end in the loop.
I am looking at hisi_sas_main.c, and for this case on each retry we free the sas_task at the bottom of the loop and then set the pointer to NULL. And after we break out from 3 failed retries we do call sas_free_task() again but task = NULL and sas_free_task() is NULL safe.
But it will free the sas_task again (outside the loop) at the end of the function.
Are you sure? As explained above, I think that this is safe.
Thanks, John
But if freed sas_task is allocated by other IO before freeing sas_task at the end of the function, it frees other IO's sas_task actually which will cause memory issue.
thread 1 thread 2 allocate task0 free task0 allocate task0 as task0 is freed already by thread 1 free task0
在 2021/11/25 19:54, John Garry 写道:
On 23/11/2021 12:04, chenxiang (M) wrote:
ry to say, but I find this hard to read.
sas_task is already released, but sas_free_task() will still be called again.
What are these situations?
For tmf IO, it is failed with stat = SAS_OPEN_REJECT and resp = SAS_TASK_COMPLETE, then it will retry three times. Eevey time sas_task is allocated at the beginning and freed at the end in the loop.
I am looking at hisi_sas_main.c, and for this case on each retry we free the sas_task at the bottom of the loop and then set the pointer to NULL. And after we break out from 3 failed retries we do call sas_free_task() again but task = NULL and sas_free_task() is NULL safe.
But it will free the sas_task again (outside the loop) at the end of the function.
Are you sure? As explained above, I think that this is safe.
But between the 3rd sas_free_task and 4rd sas_free_task, it is possible that freed sas_task is allocated for other IO. Tmf IOs from different disks are asynchronized when sending ata reset (you can see following logs)
[35133.522944] sas: Enter sas_scsi_recover_host busy: 0 failed: 0 [35133.538102] sas: ata58: end_device-2:0:0: dev error handler [35133.589002] sas: ata59: end_device-2:0:1: dev error handler [35133.633502] sas: ata60: end_device-2:0:2: dev error handler [35133.688151] sas: ata61: end_device-2:0:5: dev error handler [35135.898451] ata58.00: ATA-10: ST4000NM0035-1V4107, TN03, max UDMA/133 [35135.973229] ata59.00: ATA-10: ST4000NM0035-1V4107, TN03, max UDMA/133 [35135.986567] ata60.00: ATA-10: ST4000NM0035-1V4107, TN03, max UDMA/133 [35136.002164] ata61.00: ATA-10: ST4000NM0035-1V4107, TN03, max UDMA/133 [35151.121408] hisi_sas_v3_hw 0000:30:04.0: erroneous completion iptt=4076 task=00000000a6e52fa3 dev id=1 exp 0x500e004aaaaaaa1f phy0 addr=500e004aaaaaaa00 CQ hdr: 0x101b 0x10fec 0x0 0x0 Error info: 0x8000 0x0 0x0 0x0 [35151.149873] hisi_sas_v3_hw 0000:30:04.0: abort tmf: open reject failed [35151.268958] hisi_sas_v3_hw 0000:30:04.0: erroneous completion iptt=4077 task=0000000025465ccf dev id=1 exp 0x500e004aaaaaaa1f phy0 addr=500e004aaaaaaa00 CQ hdr: 0x101b 0x10fed 0x0 0x0 Error info: 0x8000 0x0 0x0 0x0 [35151.293686] hisi_sas_v3_hw 0000:30:04.0: abort tmf: open reject failed [35151.380960] hisi_sas_v3_hw 0000:30:04.0: erroneous completion iptt=4089 task=00000000b7661372 dev id=2 exp 0x500e004aaaaaaa1f phy1 addr=500e004aaaaaaa01 CQ hdr: 0x101b 0x20ff9 0x0 0x0 Error info: 0x8000 0x0 0x0 0x0 [35151.397819] hisi_sas_v3_hw 0000:30:04.0: erroneous completion iptt=4092 task=000000003815f088 dev id=5 exp 0x500e004aaaaaaa1f phy5 addr=500e004aaaaaaa05 CQ hdr: 0x101b 0x50ffc 0x0 0x0 Error info: 0x8000 0x0 0x0 0x0
Thanks, John
But if freed sas_task is allocated by other IO before freeing sas_task at the end of the function, it frees other IO's sas_task actually which will cause memory issue.
thread 1 thread 2 allocate task0 free task0 allocate task0 as task0 is
freed already by thread 1 free task0
.
On 25/11/2021 12:58, chenxiang (M) wrote:
As explained above, I think that this is safe.
But between the 3rd sas_free_task and 4rd sas_free_task, it is possible that freed sas_task is allocated for other IO.
Of course. But the 4th call is a no-op as task is set to NULL after the 3rd call to sas_free_task().
Tmf IOs from different disks are asynchronized when sending ata reset (you can see following logs)
Please contact me via eSpace to help me understand.
[35133.522944] sas: Enter sas_scsi_recover_host busy: 0 failed: 0 [35133.538102] sas: ata58: end_device-2:0:0: dev error handler [35133.589002] sas: ata59: end_device-2:0:1: dev error handler [35133.633502] sas: ata60: end_device-2:0:2: dev error handler [35133.688151] sas: ata61: end_device-2:0:5: dev error handler [35135.898451] ata58.00: ATA-10: ST4000NM0035-1V4107, TN03, max UDMA/133 [35135.973229] ata59.00: ATA-10: ST4000NM0035-1V4107, TN03, max UDMA/133 [35135.986567] ata60.00: ATA-10: ST4000NM0035-1V4107, TN03, max UDMA/133 [35136.002164] ata61.00: ATA-10: ST4000NM0035-1V4107, TN03, max UDMA/133 [35151.121408] hisi_sas_v3_hw 0000:30:04.0: erroneous completion iptt=4076 task=00000000a6e52fa3 dev id=1 exp 0x500e004aaaaaaa1f phy0 addr=500e004aaaaaaa00 CQ hdr: 0x101b 0x10fec 0x0 0x0 Error info: 0x8000 0x0 0x0 0x0 [35151.149873] hisi_sas_v3_hw 0000:30:04.0: abort tmf: open reject failed [35151.268958] hisi_sas_v3_hw 0000:30:04.0: erroneous completion iptt=4077 task=0000000025465ccf dev id=1 exp 0x500e004aaaaaaa1f phy0 addr=500e004aaaaaaa00 CQ hdr: 0x101b 0x10fed 0x0 0x0 Error info: 0x8000 0x0 0x0 0x0 [35151.293686] hisi_sas_v3_hw 0000:30:04.0: abort tmf: open reject failed [35151.380960] hisi_sas_v3_hw 0000:30:04.0: erroneous completion iptt=4089 task=00000000b7661372 dev id=2 exp 0x500e004aaaaaaa1f phy1 addr=500e004aaaaaaa01 CQ hdr: 0x101b 0x20ff9 0x0 0x0 Error info: 0x8000 0x0 0x0 0x0 [35151.397819] hisi_sas_v3_hw 0000:30:04.0: erroneous completion iptt=4092 task=000000003815f088 dev id=5 exp 0x500e004aaaaaaa1f phy5 addr=500e004aaaaaaa05 CQ hdr: 0x101b 0x50ffc 0x0 0x0 Error info: 0x8000 0x0 0x0 0x0
From: Xiang Chen chenxiang66@hisilicon.com
The callers of function sas_discover_event() don't check the return value of it, and also it only returns 0, so use bool for its return code.
Signed-off-by: Xiang Chen chenxiang66@hisilicon.com --- drivers/scsi/libsas/sas_discover.c | 6 ++---- include/scsi/libsas.h | 2 +- 2 files changed, 3 insertions(+), 5 deletions(-)
diff --git a/drivers/scsi/libsas/sas_discover.c b/drivers/scsi/libsas/sas_discover.c index 2faa7a372284..eef270a70754 100644 --- a/drivers/scsi/libsas/sas_discover.c +++ b/drivers/scsi/libsas/sas_discover.c @@ -556,19 +556,17 @@ static void sas_chain_event(int event, unsigned long *pending, } }
-int sas_discover_event(struct asd_sas_port *port, enum discover_event ev) +void sas_discover_event(struct asd_sas_port *port, enum discover_event ev) { struct sas_discovery *disc;
if (!port) - return 0; + return; disc = &port->disc;
BUG_ON(ev >= DISC_NUM_EVENTS);
sas_chain_event(ev, &disc->pending, &disc->disc_work[ev].work, port->ha); - - return 0; }
/** diff --git a/include/scsi/libsas.h b/include/scsi/libsas.h index 3d25d6728267..2bed0d67617a 100644 --- a/include/scsi/libsas.h +++ b/include/scsi/libsas.h @@ -686,7 +686,7 @@ void sas_ex_revalidate_domain(struct domain_device *port_dev, bool *retry);
void sas_unregister_domain_devices(struct asd_sas_port *port, int gone); void sas_init_disc(struct sas_discovery *disc, struct asd_sas_port *); -int sas_discover_event(struct asd_sas_port *, enum discover_event ev); +void sas_discover_event(struct asd_sas_port *, enum discover_event ev);
int sas_discover_sata(struct domain_device *); int sas_discover_end_dev(struct domain_device *);
On 23/11/2021 02:48, chenxiang wrote:
From: Xiang Chen chenxiang66@hisilicon.com
The callers of function sas_discover_event() don't check the return value of it, and also it only returns 0, so use bool for its return code.
You're making it return void, not bool, as far as I can see.
John
Signed-off-by: Xiang Chen chenxiang66@hisilicon.com
drivers/scsi/libsas/sas_discover.c | 6 ++---- include/scsi/libsas.h | 2 +- 2 files changed, 3 insertions(+), 5 deletions(-)
diff --git a/drivers/scsi/libsas/sas_discover.c b/drivers/scsi/libsas/sas_discover.c index 2faa7a372284..eef270a70754 100644 --- a/drivers/scsi/libsas/sas_discover.c +++ b/drivers/scsi/libsas/sas_discover.c @@ -556,19 +556,17 @@ static void sas_chain_event(int event, unsigned long *pending, } }
-int sas_discover_event(struct asd_sas_port *port, enum discover_event ev) +void sas_discover_event(struct asd_sas_port *port, enum discover_event ev) { struct sas_discovery *disc;
if (!port)
return 0;
return;
disc = &port->disc;
BUG_ON(ev >= DISC_NUM_EVENTS);
sas_chain_event(ev, &disc->pending, &disc->disc_work[ev].work, port->ha);
return 0; }
/**
diff --git a/include/scsi/libsas.h b/include/scsi/libsas.h index 3d25d6728267..2bed0d67617a 100644 --- a/include/scsi/libsas.h +++ b/include/scsi/libsas.h @@ -686,7 +686,7 @@ void sas_ex_revalidate_domain(struct domain_device *port_dev, bool *retry);
void sas_unregister_domain_devices(struct asd_sas_port *port, int gone); void sas_init_disc(struct sas_discovery *disc, struct asd_sas_port *); -int sas_discover_event(struct asd_sas_port *, enum discover_event ev); +void sas_discover_event(struct asd_sas_port *, enum discover_event ev);
int sas_discover_sata(struct domain_device *); int sas_discover_end_dev(struct domain_device *);
在 2021/11/23 19:32, John Garry 写道:
On 23/11/2021 02:48, chenxiang wrote:
From: Xiang Chen chenxiang66@hisilicon.com
The callers of function sas_discover_event() don't check the return value of it, and also it only returns 0, so use bool for its return code.
You're making it return void, not bool, as far as I can see.
Right, it return void, not bool.
John
Signed-off-by: Xiang Chen chenxiang66@hisilicon.com
drivers/scsi/libsas/sas_discover.c | 6 ++---- include/scsi/libsas.h | 2 +- 2 files changed, 3 insertions(+), 5 deletions(-)
diff --git a/drivers/scsi/libsas/sas_discover.c b/drivers/scsi/libsas/sas_discover.c index 2faa7a372284..eef270a70754 100644 --- a/drivers/scsi/libsas/sas_discover.c +++ b/drivers/scsi/libsas/sas_discover.c @@ -556,19 +556,17 @@ static void sas_chain_event(int event, unsigned long *pending, } } -int sas_discover_event(struct asd_sas_port *port, enum discover_event ev) +void sas_discover_event(struct asd_sas_port *port, enum discover_event ev) { struct sas_discovery *disc; if (!port)
return 0;
return; disc = &port->disc; BUG_ON(ev >= DISC_NUM_EVENTS); sas_chain_event(ev, &disc->pending,
&disc->disc_work[ev].work, port->ha);
- return 0; } /**
diff --git a/include/scsi/libsas.h b/include/scsi/libsas.h index 3d25d6728267..2bed0d67617a 100644 --- a/include/scsi/libsas.h +++ b/include/scsi/libsas.h @@ -686,7 +686,7 @@ void sas_ex_revalidate_domain(struct domain_device *port_dev, bool *retry); void sas_unregister_domain_devices(struct asd_sas_port *port, int gone); void sas_init_disc(struct sas_discovery *disc, struct asd_sas_port *); -int sas_discover_event(struct asd_sas_port *, enum discover_event ev); +void sas_discover_event(struct asd_sas_port *, enum discover_event ev); int sas_discover_sata(struct domain_device *); int sas_discover_end_dev(struct domain_device *);
.