Skip to content

Commit e4b9852

Browse files
Casey Chen authored and Christoph Hellwig committed
nvme-pci: fix multiple races in nvme_setup_io_queues
The two paths below could overlap each other if we power off a drive quickly after powering it on. There are multiple races in nvme_setup_io_queues() because shutdown_lock is missing and the NVMEQ_ENABLED bit is used improperly. nvme_reset_work() nvme_remove() nvme_setup_io_queues() nvme_dev_disable() ... ... A1 clear NVMEQ_ENABLED bit for admin queue lock retry: B1 nvme_suspend_io_queues() A2 pci_free_irq() admin queue B2 nvme_suspend_queue() admin queue A3 pci_free_irq_vectors() nvme_pci_disable() A4 nvme_setup_irqs(); B3 pci_free_irq_vectors() ... unlock A5 queue_request_irq() for admin queue set NVMEQ_ENABLED bit ... nvme_create_io_queues() A6 result = queue_request_irq(); set NVMEQ_ENABLED bit ... fail to allocate enough IO queues: A7 nvme_suspend_io_queues() goto retry If B3 runs in between A1 and A2, it will crash if the irqaction hasn't been freed by A2. B2 is supposed to free the admin queue IRQ but it simply can't fulfill the job, as A1 has already cleared the NVMEQ_ENABLED bit. Fix: combine A1 and A2 so the IRQ gets freed as soon as the NVMEQ_ENABLED bit gets cleared. After solving #1, A2 could race with B3 if A2 is freeing the IRQ while B3 is checking the irqaction. A3 also could race with B2 if B2 is freeing the IRQ while A3 is checking the irqaction. Fix: A2 and A3 take the lock for mutual exclusion. A3 could race with B3 since they could run free_msi_irqs() in parallel. Fix: A3 takes the lock for mutual exclusion. A4 could fail to allocate all needed IRQ vectors if A3 and A4 are interrupted by B3. Fix: A4 takes the lock for mutual exclusion. If A5/A6 happens after B2/B1, B3 will crash since the irqaction is not NULL — it was just allocated by A5/A6. Fix: lock queue_request_irq() together with the setting of the NVMEQ_ENABLED bit. A7 could get a chance to pci_free_irq() for a certain IO queue while B3 is checking the irqaction. Fix: A7 takes the lock. nvme_dev->online_queues needs to be protected by shutdown_lock. Since it is not atomic, both paths could otherwise modify it using their own copies.
Co-developed-by: Yuanyuan Zhong <[email protected]>
Signed-off-by: Casey Chen <[email protected]>
Reviewed-by: Keith Busch <[email protected]>
Signed-off-by: Christoph Hellwig <[email protected]>
1 parent 8b43ced commit e4b9852

File tree

1 file changed

+58
-8
lines changed

1 file changed

+58
-8
lines changed

drivers/nvme/host/pci.c

Lines changed: 58 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1554,6 +1554,28 @@ static void nvme_init_queue(struct nvme_queue *nvmeq, u16 qid)
15541554
wmb(); /* ensure the first interrupt sees the initialization */
15551555
}
15561556

1557+
/*
1558+
* Try getting shutdown_lock while setting up IO queues.
1559+
*/
1560+
static int nvme_setup_io_queues_trylock(struct nvme_dev *dev)
1561+
{
1562+
/*
1563+
* Give up if the lock is being held by nvme_dev_disable.
1564+
*/
1565+
if (!mutex_trylock(&dev->shutdown_lock))
1566+
return -ENODEV;
1567+
1568+
/*
1569+
* Controller is in wrong state, fail early.
1570+
*/
1571+
if (dev->ctrl.state != NVME_CTRL_CONNECTING) {
1572+
mutex_unlock(&dev->shutdown_lock);
1573+
return -ENODEV;
1574+
}
1575+
1576+
return 0;
1577+
}
1578+
15571579
static int nvme_create_queue(struct nvme_queue *nvmeq, int qid, bool polled)
15581580
{
15591581
struct nvme_dev *dev = nvmeq->dev;
@@ -1582,19 +1604,24 @@ static int nvme_create_queue(struct nvme_queue *nvmeq, int qid, bool polled)
15821604
goto release_cq;
15831605

15841606
nvmeq->cq_vector = vector;
1585-
nvme_init_queue(nvmeq, qid);
15861607

1608+
result = nvme_setup_io_queues_trylock(dev);
1609+
if (result)
1610+
return result;
1611+
nvme_init_queue(nvmeq, qid);
15871612
if (!polled) {
15881613
result = queue_request_irq(nvmeq);
15891614
if (result < 0)
15901615
goto release_sq;
15911616
}
15921617

15931618
set_bit(NVMEQ_ENABLED, &nvmeq->flags);
1619+
mutex_unlock(&dev->shutdown_lock);
15941620
return result;
15951621

15961622
release_sq:
15971623
dev->online_queues--;
1624+
mutex_unlock(&dev->shutdown_lock);
15981625
adapter_delete_sq(dev, qid);
15991626
release_cq:
16001627
adapter_delete_cq(dev, qid);
@@ -2167,7 +2194,18 @@ static int nvme_setup_io_queues(struct nvme_dev *dev)
21672194
if (nr_io_queues == 0)
21682195
return 0;
21692196

2170-
clear_bit(NVMEQ_ENABLED, &adminq->flags);
2197+
/*
2198+
* Free IRQ resources as soon as NVMEQ_ENABLED bit transitions
2199+
* from set to unset. If there is a window before it is truly freed,
2200+
* pci_free_irq_vectors() jumping into this window will crash.
2201+
* And take lock to avoid racing with pci_free_irq_vectors() in
2202+
* nvme_dev_disable() path.
2203+
*/
2204+
result = nvme_setup_io_queues_trylock(dev);
2205+
if (result)
2206+
return result;
2207+
if (test_and_clear_bit(NVMEQ_ENABLED, &adminq->flags))
2208+
pci_free_irq(pdev, 0, adminq);
21712209

21722210
if (dev->cmb_use_sqes) {
21732211
result = nvme_cmb_qdepth(dev, nr_io_queues,
@@ -2183,14 +2221,17 @@ static int nvme_setup_io_queues(struct nvme_dev *dev)
21832221
result = nvme_remap_bar(dev, size);
21842222
if (!result)
21852223
break;
2186-
if (!--nr_io_queues)
2187-
return -ENOMEM;
2224+
if (!--nr_io_queues) {
2225+
result = -ENOMEM;
2226+
goto out_unlock;
2227+
}
21882228
} while (1);
21892229
adminq->q_db = dev->dbs;
21902230

21912231
retry:
21922232
/* Deregister the admin queue's interrupt */
2193-
pci_free_irq(pdev, 0, adminq);
2233+
if (test_and_clear_bit(NVMEQ_ENABLED, &adminq->flags))
2234+
pci_free_irq(pdev, 0, adminq);
21942235

21952236
/*
21962237
* If we enable msix early due to not intx, disable it again before
@@ -2199,8 +2240,10 @@ static int nvme_setup_io_queues(struct nvme_dev *dev)
21992240
pci_free_irq_vectors(pdev);
22002241

22012242
result = nvme_setup_irqs(dev, nr_io_queues);
2202-
if (result <= 0)
2203-
return -EIO;
2243+
if (result <= 0) {
2244+
result = -EIO;
2245+
goto out_unlock;
2246+
}
22042247

22052248
dev->num_vecs = result;
22062249
result = max(result - 1, 1);
@@ -2214,8 +2257,9 @@ static int nvme_setup_io_queues(struct nvme_dev *dev)
22142257
*/
22152258
result = queue_request_irq(adminq);
22162259
if (result)
2217-
return result;
2260+
goto out_unlock;
22182261
set_bit(NVMEQ_ENABLED, &adminq->flags);
2262+
mutex_unlock(&dev->shutdown_lock);
22192263

22202264
result = nvme_create_io_queues(dev);
22212265
if (result || dev->online_queues < 2)
@@ -2224,6 +2268,9 @@ static int nvme_setup_io_queues(struct nvme_dev *dev)
22242268
if (dev->online_queues - 1 < dev->max_qid) {
22252269
nr_io_queues = dev->online_queues - 1;
22262270
nvme_disable_io_queues(dev);
2271+
result = nvme_setup_io_queues_trylock(dev);
2272+
if (result)
2273+
return result;
22272274
nvme_suspend_io_queues(dev);
22282275
goto retry;
22292276
}
@@ -2232,6 +2279,9 @@ static int nvme_setup_io_queues(struct nvme_dev *dev)
22322279
dev->io_queues[HCTX_TYPE_READ],
22332280
dev->io_queues[HCTX_TYPE_POLL]);
22342281
return 0;
2282+
out_unlock:
2283+
mutex_unlock(&dev->shutdown_lock);
2284+
return result;
22352285
}
22362286

22372287
static void nvme_del_queue_end(struct request *req, blk_status_t error)

0 commit comments

Comments
 (0)