From: Keith Busch keith.busch@intel.com
mainline inclusion from mainline-5.2-rc2 commit e43269e6e5c49d7fec599e6bba71963935b0e4ba category: bugfix bugzilla: 167363 CVE: NA
---------------------------
If a controller disabling didn't start a freeze, don't wait for the operation to complete.
Reviewed-by: Ming Lei ming.lei@redhat.com Reviewed-by: Christoph Hellwig hch@lst.de Signed-off-by: Keith Busch keith.busch@intel.com Signed-off-by: Zhihao Cheng chengzhihao1@huawei.com Reviewed-by: Hou Tao houtao1@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- drivers/nvme/host/pci.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-)
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index d295594da64d7..78ce02e476aad 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -2185,7 +2185,7 @@ static void nvme_pci_disable(struct nvme_dev *dev)
static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown) { - bool dead = true; + bool dead = true, freeze = false; struct pci_dev *pdev = to_pci_dev(dev->dev);
mutex_lock(&dev->shutdown_lock); @@ -2193,8 +2193,10 @@ static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown) u32 csts = readl(dev->bar + NVME_REG_CSTS);
if (dev->ctrl.state == NVME_CTRL_LIVE || - dev->ctrl.state == NVME_CTRL_RESETTING) + dev->ctrl.state == NVME_CTRL_RESETTING) { + freeze = true; nvme_start_freeze(&dev->ctrl); + } dead = !!((csts & NVME_CSTS_CFS) || !(csts & NVME_CSTS_RDY) || pdev->error_state != pci_channel_io_normal); } @@ -2203,10 +2205,8 @@ static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown) * Give the controller a chance to complete all entered requests if * doing a safe shutdown. */ - if (!dead) { - if (shutdown) - nvme_wait_freeze_timeout(&dev->ctrl, NVME_IO_TIMEOUT); - } + if (!dead && shutdown && freeze) + nvme_wait_freeze_timeout(&dev->ctrl, NVME_IO_TIMEOUT);
nvme_stop_queues(&dev->ctrl);
From: Keith Busch keith.busch@intel.com
mainline inclusion from mainline-5.2-rc2 commit 39a9dd81f864aa20be896bb34b4bbc2501a2453d category: bugfix bugzilla: 167363 CVE: NA
---------------------------
The reset state doesn't dispatch commands that it needs to wait for anymore. If a timeout occurs in this state, the reset work is already disabling the controller, so just reset the request's timer.
Reviewed-by: Christoph Hellwig hch@lst.de Signed-off-by: Keith Busch keith.busch@intel.com
Conflicts: drivers/nvme/host/pci.c [ Cleanup patch 3b7dffb971dc("nvme-pci: mark expected switch fall-through") not applied. ]
Signed-off-by: Zhihao Cheng chengzhihao1@huawei.com Reviewed-by: Hou Tao houtao1@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- drivers/nvme/host/pci.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 78ce02e476aad..e29b3113620df 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -1188,13 +1188,14 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved) case NVME_CTRL_DELETING: shutdown = true; case NVME_CTRL_CONNECTING: - case NVME_CTRL_RESETTING: dev_warn_ratelimited(dev->ctrl.device, "I/O %d QID %d timeout, disable controller\n", req->tag, nvmeq->qid); nvme_dev_disable(dev, shutdown); nvme_req(req)->flags |= NVME_REQ_CANCELLED; return BLK_EH_DONE; + case NVME_CTRL_RESETTING: + return BLK_EH_RESET_TIMER; default: break; }
From: Keith Busch keith.busch@intel.com
mainline inclusion from mainline-5.2-rc2 commit 2036f7263d70e67d70a67899a468588cb7356bc9 category: bugfix bugzilla: 167363 CVE: NA
---------------------------
The reset_work waits for queued IO to complete before setting the controller to live. If any of these times out and requeues, we won't be able to restart the controller because the reset_work is already running.
Flush all entered requests to a failed completion if a timeout occurs in the connecting state, and ensure the controller can't transition to the live state after we've unblocked it from waiting for completions.
Reviewed-by: Ming Lei ming.lei@redhat.com Reviewed-by: Christoph Hellwig hch@lst.de Signed-off-by: Keith Busch keith.busch@intel.com
Conflicts: drivers/nvme/host/pci.c [ Cleanup patch 3b7dffb971dc("nvme-pci: mark expected switch fall-through") not applied. ]
Signed-off-by: Zhihao Cheng chengzhihao1@huawei.com Reviewed-by: Hou Tao houtao1@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- drivers/nvme/host/pci.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-)
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index e29b3113620df..b4193b1731789 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -1148,7 +1148,6 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved) struct nvme_dev *dev = nvmeq->dev; struct request *abort_req; struct nvme_command cmd; - bool shutdown = false; u32 csts = readl(dev->bar + NVME_REG_CSTS);
/* If PCI error recovery process is happening, we cannot reset or @@ -1185,13 +1184,13 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved) * shutdown, so we return BLK_EH_DONE. */ switch (dev->ctrl.state) { - case NVME_CTRL_DELETING: - shutdown = true; case NVME_CTRL_CONNECTING: + nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_DELETING); + case NVME_CTRL_DELETING: dev_warn_ratelimited(dev->ctrl.device, "I/O %d QID %d timeout, disable controller\n", req->tag, nvmeq->qid); - nvme_dev_disable(dev, shutdown); + nvme_dev_disable(dev, true); nvme_req(req)->flags |= NVME_REQ_CANCELLED; return BLK_EH_DONE; case NVME_CTRL_RESETTING:
From: Laine Walker-Avina laine.walker-avina@intel.com
mainline inclusion from mainline-5.2-rc2 commit 2d466c7a574d0b893a233735f133c60115013c0e category: bugfix bugzilla: 167363 CVE: NA
---------------------------
We use the controller's reported maximum firmware activation time as our timeout before resetting a controller for a failed activation notice, but this value was never being read so we could only use the default timeout. Copy the Identify Controller MTFA field to the corresponding nvme_ctrl's mtfa field.
Fixes: b6dccf7fae433 (“nvme: add support for FW activation without reset”). Reviewed-by: Max Gurtovoy maxg@mellanox.com Reviewed-by: Christoph Hellwig hch@lst.de Reviewed-by: Minwoo Im minwoo.im@samsung.com Signed-off-by: Laine Walker-Avina laine.walker-avina@intel.com [changelog, fix endian] Signed-off-by: Keith Busch keith.busch@intel.com
Conflicts: drivers/nvme/host/core.c [ Cleanup 43e2d08d0790a0("nvme: avoid double dereference to convert le to cpu") is not applied. ]
Signed-off-by: Zhihao Cheng chengzhihao1@huawei.com Reviewed-by: Hou Tao houtao1@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- drivers/nvme/host/core.c | 1 + 1 file changed, 1 insertion(+)
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index b6ec73c77c18f..38b445df2f57c 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -2634,6 +2634,7 @@ int nvme_init_identify(struct nvme_ctrl *ctrl)
ctrl->oacs = le16_to_cpu(id->oacs); ctrl->oncs = le16_to_cpup(&id->oncs); + ctrl->mtfa = le16_to_cpu(id->mtfa); ctrl->oaes = le32_to_cpu(id->oaes); atomic_set(&ctrl->abort_limit, id->acl + 1); ctrl->vwc = id->vwc;
From: Tong Zhang ztong0001@gmail.com
mainline inclusion from mainline-5.9-rc4 commit 7ad92f656bddff4cf8f641e0e3b1acd4eb9644cb category: bugfix bugzilla: 167363 CVE: NA
---------------------------
This patch addresses an irq free warning and null pointer dereference error problem when nvme devices got timeout error during initialization. This problem happens when nvme_timeout() function is called while nvme_reset_work() is still in execution. This patch fixed the problem by setting flag of the problematic request to NVME_REQ_CANCELLED before calling nvme_dev_disable() to make sure __nvme_submit_sync_cmd() returns an error code and let nvme_submit_sync_cmd() fail gracefully. The following is console output.
[ 62.472097] nvme nvme0: I/O 13 QID 0 timeout, disable controller [ 62.488796] nvme nvme0: could not set timestamp (881) [ 62.494888] ------------[ cut here ]------------ [ 62.495142] Trying to free already-free IRQ 11 [ 62.495366] WARNING: CPU: 0 PID: 7 at kernel/irq/manage.c:1751 free_irq+0x1f7/0x370 [ 62.495742] Modules linked in: [ 62.495902] CPU: 0 PID: 7 Comm: kworker/u4:0 Not tainted 5.8.0+ #8 [ 62.496206] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.13.0-48-gd9c812dda519-p4 [ 62.496772] Workqueue: nvme-reset-wq nvme_reset_work [ 62.497019] RIP: 0010:free_irq+0x1f7/0x370 [ 62.497223] Code: e8 ce 49 11 00 48 83 c4 08 4c 89 e0 5b 5d 41 5c 41 5d 41 5e 41 5f c3 44 89 f6 48 c70 [ 62.498133] RSP: 0000:ffffa96800043d40 EFLAGS: 00010086 [ 62.498391] RAX: 0000000000000000 RBX: ffff9b87fc458400 RCX: 0000000000000000 [ 62.498741] RDX: 0000000000000001 RSI: 0000000000000096 RDI: ffffffff9693d72c [ 62.499091] RBP: ffff9b87fd4c8f60 R08: ffffa96800043bfd R09: 0000000000000163 [ 62.499440] R10: ffffa96800043bf8 R11: ffffa96800043bfd R12: ffff9b87fd4c8e00 [ 62.499790] R13: ffff9b87fd4c8ea4 R14: 000000000000000b R15: ffff9b87fd76b000 [ 62.500140] FS: 0000000000000000(0000) GS:ffff9b87fdc00000(0000) knlGS:0000000000000000 [ 62.500534] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 62.500816] CR2: 0000000000000000 CR3: 000000003aa0a000 CR4: 00000000000006f0 [ 62.501165] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 62.501515] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 62.501864] Call Trace: [ 62.501993] pci_free_irq+0x13/0x20 [ 62.502167] nvme_reset_work+0x5d0/0x12a0 [ 62.502369] ? update_load_avg+0x59/0x580 [ 62.502569] ? ttwu_queue_wakelist+0xa8/0xc0 [ 62.502780] ? try_to_wake_up+0x1a2/0x450 [ 62.502979] process_one_work+0x1d2/0x390 [ 62.503179] worker_thread+0x45/0x3b0 [ 62.503361] ? process_one_work+0x390/0x390 [ 62.503568] kthread+0xf9/0x130 [ 62.503726] ? kthread_park+0x80/0x80 [ 62.503911] ret_from_fork+0x22/0x30 [ 62.504090] ---[ end trace de9ed4a70f8d71e2 ]--- [ 123.912275] nvme nvme0: I/O 12 QID 0 timeout, disable controller [ 123.914670] nvme nvme0: 1/0/0 default/read/poll queues [ 123.916310] BUG: kernel NULL pointer dereference, address: 0000000000000000 [ 123.917469] #PF: supervisor write access in kernel mode [ 123.917725] #PF: error_code(0x0002) - not-present page [ 123.917976] PGD 0 P4D 0 [ 123.918109] Oops: 0002 [#1] SMP PTI [ 123.918283] CPU: 0 PID: 7 Comm: kworker/u4:0 Tainted: G W 5.8.0+ #8 [ 123.918650] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.13.0-48-gd9c812dda519-p4 [ 123.919219] Workqueue: nvme-reset-wq nvme_reset_work [ 123.919469] RIP: 0010:__blk_mq_alloc_map_and_request+0x21/0x80 [ 123.919757] Code: 66 0f 1f 84 00 00 00 00 00 41 55 41 54 55 48 63 ee 53 48 8b 47 68 89 ee 48 89 fb 8b4 [ 123.920657] RSP: 0000:ffffa96800043d40 EFLAGS: 00010286 [ 123.920912] RAX: ffff9b87fc4fee40 RBX: ffff9b87fc8cb008 RCX: 0000000000000000 [ 123.921258] RDX: 0000000000000000 RSI: 0000000000000000 RDI: ffff9b87fc618000 [ 123.921602] RBP: 0000000000000000 R08: ffff9b87fdc2c4a0 R09: ffff9b87fc616000 [ 123.921949] R10: 0000000000000000 R11: ffff9b87fffd1500 R12: 0000000000000000 [ 123.922295] R13: 0000000000000000 R14: ffff9b87fc8cb200 R15: ffff9b87fc8cb000 [ 123.922641] FS: 0000000000000000(0000) GS:ffff9b87fdc00000(0000) knlGS:0000000000000000 [ 123.923032] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 123.923312] CR2: 0000000000000000 CR3: 000000003aa0a000 CR4: 00000000000006f0 [ 123.923660] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 123.924007] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 123.924353] Call Trace: [ 123.924479] blk_mq_alloc_tag_set+0x137/0x2a0 [ 123.924694] nvme_reset_work+0xed6/0x12a0 [ 123.924898] process_one_work+0x1d2/0x390 [ 123.925099] worker_thread+0x45/0x3b0 [ 123.925280] ? process_one_work+0x390/0x390 [ 123.925486] kthread+0xf9/0x130 [ 123.925642] ? kthread_park+0x80/0x80 [ 123.925825] ret_from_fork+0x22/0x30 [ 123.926004] Modules linked in: [ 123.926158] CR2: 0000000000000000 [ 123.926322] ---[ end trace de9ed4a70f8d71e3 ]--- [ 123.926549] RIP: 0010:__blk_mq_alloc_map_and_request+0x21/0x80 [ 123.926832] Code: 66 0f 1f 84 00 00 00 00 00 41 55 41 54 55 48 63 ee 53 48 8b 47 68 89 ee 48 89 fb 8b4 [ 123.927734] RSP: 0000:ffffa96800043d40 EFLAGS: 00010286 [ 123.927989] RAX: ffff9b87fc4fee40 RBX: ffff9b87fc8cb008 RCX: 0000000000000000 [ 123.928336] RDX: 0000000000000000 RSI: 0000000000000000 RDI: ffff9b87fc618000 [ 123.928679] RBP: 0000000000000000 R08: ffff9b87fdc2c4a0 R09: ffff9b87fc616000 [ 123.929025] R10: 0000000000000000 R11: ffff9b87fffd1500 R12: 0000000000000000 [ 123.929370] R13: 0000000000000000 R14: ffff9b87fc8cb200 R15: ffff9b87fc8cb000 [ 123.929715] FS: 0000000000000000(0000) GS:ffff9b87fdc00000(0000) knlGS:0000000000000000 [ 123.930106] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 123.930384] CR2: 0000000000000000 CR3: 000000003aa0a000 CR4: 00000000000006f0 [ 123.930731] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 123.931077] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
Co-developed-by: Keith Busch kbusch@kernel.org Signed-off-by: Tong Zhang ztong0001@gmail.com Reviewed-by: Keith Busch kbusch@kernel.org Signed-off-by: Sagi Grimberg sagi@grimberg.me Signed-off-by: Zhihao Cheng chengzhihao1@huawei.com Reviewed-by: Hou Tao houtao1@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- drivers/nvme/host/pci.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index b4193b1731789..4e32e511c33ee 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -1190,8 +1190,8 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved) dev_warn_ratelimited(dev->ctrl.device, "I/O %d QID %d timeout, disable controller\n", req->tag, nvmeq->qid); - nvme_dev_disable(dev, true); nvme_req(req)->flags |= NVME_REQ_CANCELLED; + nvme_dev_disable(dev, true); return BLK_EH_DONE; case NVME_CTRL_RESETTING: return BLK_EH_RESET_TIMER; @@ -1208,10 +1208,10 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved) dev_warn(dev->ctrl.device, "I/O %d QID %d timeout, reset controller\n", req->tag, nvmeq->qid); + nvme_req(req)->flags |= NVME_REQ_CANCELLED; nvme_dev_disable(dev, false); nvme_reset_ctrl(&dev->ctrl);
- nvme_req(req)->flags |= NVME_REQ_CANCELLED; return BLK_EH_DONE; }
From: Xianting Tian tian.xianting@h3c.com
mainline inclusion from mainline-5.9-rc7 commit 50b7c24390a53c78de546215282fb52980f1d7b7 category: bugfix bugzilla: 167363 CVE: NA
---------------------------
Currently, we use nvmeq->q_depth as the upper limit for a valid tag in nvme_handle_cqe(), it is not correct. Because the available tag number is recorded in tagset, which is not equal to nvmeq->q_depth.
The nvme driver registers interrupts for queues before initializing the tagset, because it uses the number of successful request_irq() calls to configure the tagset parameters. This allows a race condition with the current tag validity check if the controller happens to produce an interrupt with a corrupted CQE before the tagset is initialized.
Replace the driver's indirect tag check with the one already provided by the block layer.
Signed-off-by: Xianting Tian tian.xianting@h3c.com Reviewed-by: Keith Busch kbusch@kernel.org Signed-off-by: Christoph Hellwig hch@lst.de
Conflicts: drivers/nvme/host/pci.c [ Cleanup commit cfa27356f835dc ("nvme-pci: remove nvmeq->tags") is not applied. ]
Signed-off-by: Zhihao Cheng chengzhihao1@huawei.com Reviewed-by: Hou Tao houtao1@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- drivers/nvme/host/pci.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-)
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 4e32e511c33ee..fe4834b786658 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -889,13 +889,6 @@ static inline void nvme_handle_cqe(struct nvme_queue *nvmeq, u16 idx) volatile struct nvme_completion *cqe = &nvmeq->cqes[idx]; struct request *req;
- if (unlikely(cqe->command_id >= nvmeq->q_depth)) { - dev_warn(nvmeq->dev->ctrl.device, - "invalid id %d completed on queue %d\n", - cqe->command_id, le16_to_cpu(cqe->sq_id)); - return; - } - /* * AEN requests are special as they don't time out and can * survive any kind of queue freeze and often don't respond to @@ -910,6 +903,13 @@ static inline void nvme_handle_cqe(struct nvme_queue *nvmeq, u16 idx) }
req = blk_mq_tag_to_rq(*nvmeq->tags, cqe->command_id); + if (unlikely(!req)) { + dev_warn(nvmeq->dev->ctrl.device, + "invalid id %d completed on queue %d\n", + cqe->command_id, le16_to_cpu(cqe->sq_id)); + return; + } + nvme_end_request(req, cqe->status, cqe->result); }
From: Chaitanya Kulkarni chaitanya.kulkarni@wdc.com
mainline inclusion from mainline-5.9-rc7 commit 46d2613eae51d527ecaf0e8248a9bfcc0b92aa7e category: bugfix bugzilla: 167363 CVE: NA
---------------------------
In the function nvme_get_effects_log() it uses NVME_NSID_ALL which has namespace scope. The command effect log page is controller specific.
Replace NVME_NSID_ALL with 0x00 which specifies the controller scope instead of namespace scope.
Fixes: 84fef62d135b ("nvme: check admin passthru command effects") Link: https://bugzilla.kernel.org/show_bug.cgi?id=209287 Reported-by: Huai-Cheng Kuo hh81478072@gmail.com Signed-off-by: Chaitanya Kulkarni chaitanya.kulkarni@wdc.com Signed-off-by: Christoph Hellwig hch@lst.de
Conflicts: drivers/nvme/host/core.c [ Feature commit be93e87e78025("nvme: support for multiple Command Sets Supported and Effects log pages") is not applied. ]
Signed-off-by: Zhihao Cheng chengzhihao1@huawei.com Reviewed-by: Hou Tao houtao1@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- drivers/nvme/host/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 38b445df2f57c..bd3bc7ca39e40 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -2552,7 +2552,7 @@ static int nvme_get_effects_log(struct nvme_ctrl *ctrl) if (!ctrl->effects) return 0;
- ret = nvme_get_log(ctrl, NVME_NSID_ALL, NVME_LOG_CMD_EFFECTS, 0, + ret = nvme_get_log(ctrl, 0x00, NVME_LOG_CMD_EFFECTS, 0, ctrl->effects, sizeof(*ctrl->effects), 0); if (ret) { kfree(ctrl->effects);
From: Bart Van Assche bvanassche@acm.org
mainline inclusion from mainline-4.20-rc1 commit 202359c007f6b1d6247a062c0682d6d5bcd3e7d7 category: bugfix bugzilla: 167363 CVE: NA
---------------------------
The nvme_user_io.slba field is 64 bits wide. That value is copied into the 32-bit bio_integrity_payload.bip_iter.bi_sector field. Make that truncation explicit to avoid that Coverity complains about implicit truncation. See also Coverity ID 1056486 on http://scan.coverity.com/projects/linux.
Signed-off-by: Bart Van Assche bvanassche@acm.org Reviewed-by: Keith Busch keith.busch@intel.com Signed-off-by: Christoph Hellwig hch@lst.de Signed-off-by: Zhihao Cheng chengzhihao1@huawei.com Reviewed-by: Hou Tao houtao1@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- drivers/nvme/host/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index bd3bc7ca39e40..cfbd94c1e6625 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1255,7 +1255,7 @@ static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio)
return nvme_submit_user_cmd(ns->queue, &c, (void __user *)(uintptr_t)io.addr, length, - metadata, meta_len, io.slba, NULL, 0); + metadata, meta_len, lower_32_bits(io.slba), NULL, 0); }
static u32 nvme_known_admin_effects(u8 opcode)
From: Nick Bowler nbowler@draconx.ca
mainline inclusion from mainline-5.7-rc1 commit c95b708d5fa65b4e51f088ee077d127fd5a57b70 category: bugfix bugzilla: 167363 CVE: NA
---------------------------
On a 32-bit kernel, the upper bits of userspace addresses passed via various ioctls are silently ignored by the nvme driver.
However on a 64-bit kernel running a compat task, these upper bits are not ignored and are in fact required to be zero for the ioctls to work.
Unfortunately, this difference matters. 32-bit smartctl submits the NVME_IOCTL_ADMIN_CMD ioctl with garbage in these upper bits because it seems the pointer value it puts into the nvme_passthru_cmd structure is sign extended. This works fine on 32-bit kernels but fails on a 64-bit one because (at least on my setup) the addresses smartctl uses are consistently above 2G. For example:
# smartctl -x /dev/nvme0n1 smartctl 7.1 2019-12-30 r5022 [x86_64-linux-5.5.11] (local build) Copyright (C) 2002-19, Bruce Allen, Christian Franke, www.smartmontools.org
Read NVMe Identify Controller failed: NVME_IOCTL_ADMIN_CMD: Bad address
Since changing 32-bit kernels to actually check all of the submitted address bits now would break existing userspace, this patch fixes the compat problem by explicitly zeroing the upper bits in the compat case. This enables 32-bit smartctl to work on a 64-bit kernel.
Signed-off-by: Nick Bowler nbowler@draconx.ca Signed-off-by: Christoph Hellwig hch@lst.de
Conflicts: drivers/nvme/host/core.c [ Feature commit 65e68edce0db4("nvme: allow 64-bit results in passthru commands") is not applied. ]
Signed-off-by: Zhihao Cheng chengzhihao1@huawei.com Reviewed-by: Hou Tao houtao1@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- drivers/nvme/host/core.c | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-)
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index cfbd94c1e6625..3ebc4b5e0e90f 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -14,6 +14,7 @@
#include <linux/blkdev.h> #include <linux/blk-mq.h> +#include <linux/compat.h> #include <linux/delay.h> #include <linux/errno.h> #include <linux/hdreg.h> @@ -1208,6 +1209,18 @@ static void nvme_enable_aen(struct nvme_ctrl *ctrl) queue_work(nvme_wq, &ctrl->async_event_work); }
+/* + * Convert integer values from ioctl structures to user pointers, silently + * ignoring the upper bits in the compat case to match behaviour of 32-bit + * kernels. + */ +static void __user *nvme_to_user_ptr(uintptr_t ptrval) +{ + if (in_compat_syscall()) + ptrval = (compat_uptr_t)ptrval; + return (void __user *)ptrval; +} + static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio) { struct nvme_user_io io; @@ -1231,7 +1244,7 @@ static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio)
length = (io.nblocks + 1) << ns->lba_shift; meta_len = (io.nblocks + 1) * ns->ms; - metadata = (void __user *)(uintptr_t)io.metadata; + metadata = nvme_to_user_ptr(io.metadata);
if (ns->ext) { length += meta_len; @@ -1254,7 +1267,7 @@ static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio) c.rw.appmask = cpu_to_le16(io.appmask);
return nvme_submit_user_cmd(ns->queue, &c, - (void __user *)(uintptr_t)io.addr, length, + nvme_to_user_ptr(io.addr), length, metadata, meta_len, lower_32_bits(io.slba), NULL, 0); }
@@ -1374,8 +1387,8 @@ static int nvme_user_cmd(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
effects = nvme_passthru_start(ctrl, ns, cmd.opcode); status = nvme_submit_user_cmd(ns ? ns->queue : ctrl->admin_q, &c, - (void __user *)(uintptr_t)cmd.addr, cmd.data_len, - (void __user *)(uintptr_t)cmd.metadata, cmd.metadata_len, + nvme_to_user_ptr(cmd.addr), cmd.data_len, + nvme_to_user_ptr(cmd.metadata), cmd.metadata_len, 0, &cmd.result, timeout); nvme_passthru_end(ctrl, effects);
From: Revanth Rajashekar revanth.rajashekar@intel.com
mainline inclusion from mainline-5.11-rc5 commit 4d6b1c95b974761c01cbad92321b82232b66d2a2 category: bugfix bugzilla: 167363 CVE: NA
---------------------------
According to NVMe spec v1.4, section 8.3.1, the PRINFO bit and the metadata size play a vital role in deteriming the host buffer size.
If PRIFNO bit is set and MS==8, the host doesn't add the metadata buffer, instead the controller adds it.
Signed-off-by: Revanth Rajashekar revanth.rajashekar@intel.com Signed-off-by: Christoph Hellwig hch@lst.de
Conflicts: drivers/nvme/host/core.c [ Cleanup patch ffc89b1d3ca4("nvme: introduce namespace features flag") is not applied. ]
Signed-off-by: Zhihao Cheng chengzhihao1@huawei.com Reviewed-by: Hou Tao houtao1@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- drivers/nvme/host/core.c | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-)
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 3ebc4b5e0e90f..aab3d9a950a5e 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1243,8 +1243,21 @@ static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio) }
length = (io.nblocks + 1) << ns->lba_shift; - meta_len = (io.nblocks + 1) * ns->ms; - metadata = nvme_to_user_ptr(io.metadata); + + if ((io.control & NVME_RW_PRINFO_PRACT) && + ns->ms == sizeof(struct t10_pi_tuple)) { + /* + * Protection information is stripped/inserted by the + * controller. + */ + if (nvme_to_user_ptr(io.metadata)) + return -EINVAL; + meta_len = 0; + metadata = NULL; + } else { + meta_len = (io.nblocks + 1) * ns->ms; + metadata = nvme_to_user_ptr(io.metadata); + }
if (ns->ext) { length += meta_len;
From: Christoph Hellwig hch@lst.de
mainline inclusion from mainline-5.0-rc1 commit 4e224106673f1e8679249a9cac75712b896861b0 category: bugfix bugzilla: 167363 CVE: NA
---------------------------
This gets rid of all the messing with cq_vector and the ->polled field by using an atomic bitop to mark the queue enabled or not.
Signed-off-by: Christoph Hellwig hch@lst.de Reviewed-by: Keith Busch keith.busch@intel.com Signed-off-by: Jens Axboe axboe@kernel.dk
Conflicts: drivers/nvme/host/pci.c [ Non-bugfix patch 4b04cc6a8f8("nvme: add separate poll queue map") is not applied. ]
Signed-off-by: Zhihao Cheng chengzhihao1@huawei.com Reviewed-by: Hou Tao houtao1@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- drivers/nvme/host/pci.c | 42 ++++++++++++++++------------------------- 1 file changed, 16 insertions(+), 26 deletions(-)
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index fe4834b786658..bc73499d38f7f 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -167,12 +167,14 @@ struct nvme_queue { dma_addr_t cq_dma_addr; u32 __iomem *q_db; u16 q_depth; - s16 cq_vector; + u16 cq_vector; u16 sq_tail; u16 cq_head; u16 last_cq_head; u16 qid; u8 cq_phase; + unsigned long flags; +#define NVMEQ_ENABLED 0 u32 *dbbuf_sq_db; u32 *dbbuf_cq_db; u32 *dbbuf_sq_ei; @@ -833,7 +835,7 @@ static blk_status_t nvme_queue_rq(struct blk_mq_hw_ctx *hctx, * We should not need to do this, but we're still using this to * ensure we can drain requests on a dying queue. */ - if (unlikely(nvmeq->cq_vector < 0)) + if (unlikely(!test_bit(NVMEQ_ENABLED, &nvmeq->flags))) return BLK_STS_IOERR;
ret = nvme_setup_cmd(ns, req, &cmnd); @@ -1274,29 +1276,16 @@ static void nvme_free_queues(struct nvme_dev *dev, int lowest) */ static int nvme_suspend_queue(struct nvme_queue *nvmeq) { - int vector; - - spin_lock_irq(&nvmeq->cq_lock); - if (nvmeq->cq_vector == -1) { - spin_unlock_irq(&nvmeq->cq_lock); + if (!test_and_clear_bit(NVMEQ_ENABLED, &nvmeq->flags)) return 1; - } - vector = nvmeq->cq_vector; - nvmeq->dev->online_queues--; - nvmeq->cq_vector = -1; - spin_unlock_irq(&nvmeq->cq_lock);
- /* - * Ensure that nvme_queue_rq() sees it ->cq_vector == -1 without - * having to grab the lock. - */ + /* ensure that nvme_queue_rq() sees NVMEQ_ENABLED cleared */ mb();
+ nvmeq->dev->online_queues--; if (!nvmeq->qid && nvmeq->dev->ctrl.admin_q) blk_mq_quiesce_queue(nvmeq->dev->ctrl.admin_q); - - pci_free_irq(to_pci_dev(nvmeq->dev->dev), vector, nvmeq); - + pci_free_irq(to_pci_dev(nvmeq->dev->dev), nvmeq->cq_vector, nvmeq); return 0; }
@@ -1387,7 +1376,6 @@ static int nvme_alloc_queue(struct nvme_dev *dev, int qid, int depth) nvmeq->q_db = &dev->dbs[qid * 2 * dev->db_stride]; nvmeq->q_depth = depth; nvmeq->qid = qid; - nvmeq->cq_vector = -1; dev->ctrl.queue_count++;
return 0; @@ -1432,7 +1420,7 @@ static int nvme_create_queue(struct nvme_queue *nvmeq, int qid) { struct nvme_dev *dev = nvmeq->dev; int result; - s16 vector; + u16 vector = 0;
if (dev->cmb && use_cmb_sqes && (dev->cmbsz & NVME_CMBSZ_SQS)) { unsigned offset = (qid - 1) * roundup(SQ_SIZE(nvmeq->q_depth), @@ -1467,10 +1455,10 @@ static int nvme_create_queue(struct nvme_queue *nvmeq, int qid) if (result < 0) goto release_sq;
+ set_bit(NVMEQ_ENABLED, &nvmeq->flags); return result;
release_sq: - nvmeq->cq_vector = -1; dev->online_queues--; adapter_delete_sq(dev, qid); release_cq: @@ -1611,10 +1599,11 @@ static int nvme_pci_configure_admin_queue(struct nvme_dev *dev) nvme_init_queue(nvmeq, 0); result = queue_request_irq(nvmeq); if (result) { - nvmeq->cq_vector = -1; + dev->online_queues--; return result; }
+ set_bit(NVMEQ_ENABLED, &nvmeq->flags); return result; }
@@ -1923,6 +1912,8 @@ static int nvme_setup_io_queues(struct nvme_dev *dev) if (nr_io_queues == 0) return 0;
+ clear_bit(NVMEQ_ENABLED, &adminq->flags); + if (dev->cmb && (dev->cmbsz & NVME_CMBSZ_SQS)) { result = nvme_cmb_qdepth(dev, nr_io_queues, sizeof(struct nvme_command)); @@ -1965,10 +1956,9 @@ static int nvme_setup_io_queues(struct nvme_dev *dev) * number of interrupts. */ result = queue_request_irq(adminq); - if (result) { - adminq->cq_vector = -1; + if (result) return result; - } + set_bit(NVMEQ_ENABLED, &adminq->flags);
result = nvme_create_io_queues(dev); if (result || dev->online_queues < 2)