Skip to content

Commit 7c4c84c

Browse files
Patrisious Haddad authored and gregkh committed
RDMA/mlx5: Fix error flow upon firmware failure for RQ destruction
[ Upstream commit 5d2ea5a ] Upon RQ destruction, if the firmware command fails (it is the last resource to be destroyed), some SW resources have already been cleaned up regardless of the failure. Now properly roll back the object to its original state upon such a failure, in order to avoid a use-after-free in case someone tries to destroy the object again, which results in the following kernel trace: refcount_t: underflow; use-after-free. WARNING: CPU: 0 PID: 37589 at lib/refcount.c:28 refcount_warn_saturate+0xf4/0x148 Modules linked in: rdma_ucm(OE) rdma_cm(OE) iw_cm(OE) ib_ipoib(OE) ib_cm(OE) ib_umad(OE) mlx5_ib(OE) rfkill mlx5_core(OE) mlxdevm(OE) ib_uverbs(OE) ib_core(OE) psample mlxfw(OE) mlx_compat(OE) macsec tls pci_hyperv_intf sunrpc vfat fat virtio_net net_failover failover fuse loop nfnetlink vsock_loopback vmw_vsock_virtio_transport_common vmw_vsock_vmci_transport vmw_vmci vsock xfs crct10dif_ce ghash_ce sha2_ce sha256_arm64 sha1_ce virtio_console virtio_gpu virtio_blk virtio_dma_buf virtio_mmio dm_mirror dm_region_hash dm_log dm_mod xpmem(OE) CPU: 0 UID: 0 PID: 37589 Comm: python3 Kdump: loaded Tainted: G OE ------- --- 6.12.0-54.el10.aarch64 #1 Tainted: [O]=OOT_MODULE, [E]=UNSIGNED_MODULE Hardware name: QEMU KVM Virtual Machine, BIOS 0.0.0 02/06/2015 pstate: 60400005 (nZCv daif +PAN -UAO -TCO -DIT -SSBS BTYPE=--) pc : refcount_warn_saturate+0xf4/0x148 lr : refcount_warn_saturate+0xf4/0x148 sp : ffff80008b81b7e0 x29: ffff80008b81b7e0 x28: ffff000133d51600 x27: 0000000000000001 x26: 0000000000000000 x25: 00000000ffffffea x24: ffff00010ae80f00 x23: ffff00010ae80f80 x22: ffff0000c66e5d08 x21: 0000000000000000 x20: ffff0000c66e0000 x19: ffff00010ae80340 x18: 0000000000000006 x17: 0000000000000000 x16: 0000000000000020 x15: ffff80008b81b37f x14: 0000000000000000 x13: 2e656572662d7265 x12: ffff80008283ef78 x11: ffff80008257efd0 x10: ffff80008283efd0 x9 : ffff80008021ed90 x8 : 0000000000000001 x7 : 00000000000bffe8 x6 : c0000000ffff7fff x5 : ffff0001fb8e3408 x4 : 
0000000000000000 x3 : ffff800179993000 x2 : 0000000000000000 x1 : 0000000000000000 x0 : ffff000133d51600 Call trace: refcount_warn_saturate+0xf4/0x148 mlx5_core_put_rsc+0x88/0xa0 [mlx5_ib] mlx5_core_destroy_rq_tracked+0x64/0x98 [mlx5_ib] mlx5_ib_destroy_wq+0x34/0x80 [mlx5_ib] ib_destroy_wq_user+0x30/0xc0 [ib_core] uverbs_free_wq+0x28/0x58 [ib_uverbs] destroy_hw_idr_uobject+0x34/0x78 [ib_uverbs] uverbs_destroy_uobject+0x48/0x240 [ib_uverbs] __uverbs_cleanup_ufile+0xd4/0x1a8 [ib_uverbs] uverbs_destroy_ufile_hw+0x48/0x120 [ib_uverbs] ib_uverbs_close+0x2c/0x100 [ib_uverbs] __fput+0xd8/0x2f0 __fput_sync+0x50/0x70 __arm64_sys_close+0x40/0x90 invoke_syscall.constprop.0+0x74/0xd0 do_el0_svc+0x48/0xe8 el0_svc+0x44/0x1d0 el0t_64_sync_handler+0x120/0x130 el0t_64_sync+0x1a4/0x1a8 Fixes: e2013b2 ("net/mlx5_core: Add RQ and SQ event handling") Signed-off-by: Patrisious Haddad <[email protected]> Link: https://patch.msgid.link/3181433ccdd695c63560eeeb3f0c990961732101.1745839855.git.leon@kernel.org Signed-off-by: Leon Romanovsky <[email protected]> Signed-off-by: Sasha Levin <[email protected]>
1 parent 5e4519d commit 7c4c84c

File tree

2 files changed

+29
-2
lines changed

2 files changed

+29
-2
lines changed

drivers/infiniband/hw/mlx5/qpc.c

Lines changed: 28 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -21,8 +21,10 @@ mlx5_get_rsc(struct mlx5_qp_table *table, u32 rsn)
2121
spin_lock_irqsave(&table->lock, flags);
2222

2323
common = radix_tree_lookup(&table->tree, rsn);
24-
if (common)
24+
if (common && !common->invalid)
2525
refcount_inc(&common->refcount);
26+
else
27+
common = NULL;
2628

2729
spin_unlock_irqrestore(&table->lock, flags);
2830

@@ -178,6 +180,18 @@ static int create_resource_common(struct mlx5_ib_dev *dev,
178180
return 0;
179181
}
180182

183+
static void modify_resource_common_state(struct mlx5_ib_dev *dev,
184+
struct mlx5_core_qp *qp,
185+
bool invalid)
186+
{
187+
struct mlx5_qp_table *table = &dev->qp_table;
188+
unsigned long flags;
189+
190+
spin_lock_irqsave(&table->lock, flags);
191+
qp->common.invalid = invalid;
192+
spin_unlock_irqrestore(&table->lock, flags);
193+
}
194+
181195
static void destroy_resource_common(struct mlx5_ib_dev *dev,
182196
struct mlx5_core_qp *qp)
183197
{
@@ -604,8 +618,20 @@ int mlx5_core_create_rq_tracked(struct mlx5_ib_dev *dev, u32 *in, int inlen,
604618
int mlx5_core_destroy_rq_tracked(struct mlx5_ib_dev *dev,
605619
struct mlx5_core_qp *rq)
606620
{
621+
int ret;
622+
623+
/* The rq destruction can be called again in case it fails, hence we
624+
* mark the common resource as invalid and only once FW destruction
625+
* is completed successfully we actually destroy the resources.
626+
*/
627+
modify_resource_common_state(dev, rq, true);
628+
ret = destroy_rq_tracked(dev, rq->qpn, rq->uid);
629+
if (ret) {
630+
modify_resource_common_state(dev, rq, false);
631+
return ret;
632+
}
607633
destroy_resource_common(dev, rq);
608-
return destroy_rq_tracked(dev, rq->qpn, rq->uid);
634+
return 0;
609635
}
610636

611637
static void destroy_sq_tracked(struct mlx5_ib_dev *dev, u32 sqn, u16 uid)

include/linux/mlx5/driver.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -390,6 +390,7 @@ struct mlx5_core_rsc_common {
390390
enum mlx5_res_type res;
391391
refcount_t refcount;
392392
struct completion free;
393+
bool invalid;
393394
};
394395

395396
struct mlx5_uars_page {

0 commit comments

Comments
 (0)