
Commit 11dde18

Avinash Repaka authored and vijay-suman committed
RDS: IB: Post invalidation & registration WRs together
Post the invalidation work-request and the registration work-request together, instead of posting the invalidation request during MR pool flushing. This cuts the MR work completions in half and increases throughput in multi-threaded scenarios. The best performance improvements were observed when the RDS_GET_MR socket option was used to register the MR.

For reference, below is a set of rds-stress results observed on an Exadata X4-2 setup, which uses CX3, with -t 16 -d 16 -D 8K --rdma-use-get-mr 1:

    w/o patch - 839394.7 KB/s
    w/  patch - 1382444.2 KB/s

Note that this patch also moves the teardown function, which unmaps and unpins the pages associated with an MR, from the flushing function to the mapping function, alongside the invalidation.

Orabug: 27602183

Signed-off-by: Avinash Repaka <[email protected]>
Tested-by: Anand Bibhuti <[email protected]>
Tested-by: Efrain Galaviz <[email protected]>
Tested-by: Rosa Lopez <[email protected]>
Tested-by: Gerald Gibson <[email protected]>
Reviewed-by: Wei Lin Guay <[email protected]>
Acked-by: Santosh Shilimkar <[email protected]>
Signed-off-by: Somasundaram Krishnasamy <[email protected]>

Orabug: 33590097
UEK6 => UEK7

(cherry picked from commit 1c14156)
cherry-pick-repo=UEK/production/linux-uek.git

Signed-off-by: Gerd Rausch <[email protected]>
Reviewed-by: William Kucharski <[email protected]>

Orabug: 33590087
UEK7 => LUCI

(cherry picked from commit 461ade0)
cherry-pick-repo=UEK/production/linux-uek.git

Signed-off-by: Gerd Rausch <[email protected]>
Reviewed-by: William Kucharski <[email protected]>
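For orientation, here is a condensed sketch of the chaining idea the patch implements in rds_ib_rdma_build_fastreg(): when the MR still holds a valid rkey, a LOCAL_INV work-request is linked ahead of the FAST_REG_MR work-request so both go out in a single ib_post_send() call. It uses the same out-of-tree ib_fast_reg_wr API as this UEK tree; post_inv_reg_chain() is an illustrative name, not a function this commit adds, and the WR credit accounting and error handling of the real function are omitted.

static int post_inv_reg_chain(struct ib_qp *qp, struct rds_ib_mr *ibmr)
{
	struct ib_send_wr inv_wr, *first_wr = NULL, *failed_wr;
	struct ib_fast_reg_wr reg_wr;

	if (ibmr->fr_state == MR_IS_VALID) {
		/* Old rkey is still registered: invalidate it first.
		 * The sentinel wr_id lets completion handlers skip it.
		 */
		memset(&inv_wr, 0, sizeof(inv_wr));
		inv_wr.wr_id = RDS_MR_INV_WR_ID;
		inv_wr.opcode = IB_WR_LOCAL_INV;
		inv_wr.ex.invalidate_rkey = ibmr->mr->rkey;
		first_wr = &inv_wr;
	}

	memset(&reg_wr, 0, sizeof(reg_wr));
	reg_wr.wr.wr_id = (u64)ibmr;
	reg_wr.wr.opcode = IB_WR_FAST_REG_MR;
	reg_wr.length = ibmr->sg_byte_len;
	reg_wr.rkey = ibmr->mr->rkey;
	reg_wr.page_list = ibmr->page_list;
	reg_wr.page_list_len = ibmr->dma_npages;
	reg_wr.page_shift = PAGE_SHIFT;
	reg_wr.access_flags = IB_ACCESS_LOCAL_WRITE |
			      IB_ACCESS_REMOTE_READ |
			      IB_ACCESS_REMOTE_WRITE;
	/* Only the registration WR is signaled; on a QP created with
	 * sq_sig_type == IB_SIGNAL_REQ_WR, the unsignaled invalidate
	 * should raise no completion on success, which is where the
	 * halving of MR work completions comes from.
	 */
	reg_wr.wr.send_flags = IB_SEND_SIGNALED;

	if (first_wr)
		inv_wr.next = &reg_wr.wr;	/* chain: INV -> FASTREG */
	else
		first_wr = &reg_wr.wr;		/* first use: FASTREG only */

	return ib_post_send(qp, first_wr, &failed_wr);
}

The completion handlers below still check for RDS_MR_INV_WR_ID and return early, so an invalidate completion that does surface (for example on a connection QP, or on error) is simply ignored.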
1 parent: 50c4e90

2 files changed: +98 −115 lines

net/rds/ib_cm.c (+1)
@@ -1662,6 +1662,7 @@ int rds_ib_setup_fastreg(struct rds_ib_device *rds_ibdev)
 	qp_init_attr.cap.max_recv_wr = 0;
 	qp_init_attr.cap.max_send_sge = 0;
 	qp_init_attr.cap.max_recv_sge = 0;
+	qp_init_attr.sq_sig_type = IB_SIGNAL_REQ_WR;
 
 	rds_ibdev->fastreg_qp = ib_create_qp(rds_ibdev->pd, &qp_init_attr);
 	if (IS_ERR(rds_ibdev->fastreg_qp)) {

net/rds/ib_rdma.c (+97 −115)
@@ -49,6 +49,8 @@ enum rds_ib_fr_state {
 	MR_IS_STALE,	/* mr is possibly corrupt, marked if failure */
 };
 
+#define RDS_MR_INV_WR_ID	((u64)0xefefefefefefefefULL)
+
 /*
  * This is stored as mr->r_trans_private.
  */
@@ -120,7 +122,6 @@ static void rds_ib_mr_pool_flush_worker(struct work_struct *work);
 static int rds_ib_map_fastreg_mr(struct rds_ib_device *rds_ibdev,
 				 struct rds_ib_mr *ibmr,
 				 struct scatterlist *sg, unsigned int sg_len);
-static int rds_ib_fastreg_inv(struct rds_ib_mr *ibmr);
 
 static struct rds_ib_device *rds_ib_get_device(struct in6_addr *ipaddr)
 {
@@ -869,20 +870,18 @@ static int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool,
 		ret = ib_unmap_fmr(&fmr_list);
 		if (ret)
 			pr_warn("RDS/IB: ib_unmap_fmr failed (err=%d)\n", ret);
-	} else {
-		list_for_each_entry(ibmr, &unmap_list, unmap_list) {
-			ret = rds_ib_fastreg_inv(ibmr);
-			if (ret)
-				pr_warn_ratelimited(
-					"RDS/IB: rds_ib_fastreg_inv failed (err=%d)\n",
-					ret);
-		}
 	}
 
 	/* Now we can destroy the DMA mapping and unpin any pages */
 	list_for_each_entry_safe(ibmr, next, &unmap_list, unmap_list) {
+		/* Teardown only FMRs here, teardown fastreg MRs later after
+		 * invalidating. However, increment 'unpinned' for both, since
+		 * it is used to trigger flush.
+		 */
 		unpinned += ibmr->sg_len;
-		__rds_ib_teardown_mr(ibmr);
+		if (!pool->use_fastreg)
+			__rds_ib_teardown_mr(ibmr);
+
 		if (nfreed < free_goal ||
 		    (!pool->use_fastreg &&
 		     ibmr->remap_count >= pool->fmr_attr.max_maps) ||
@@ -893,6 +892,7 @@ static int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool,
 				rds_ib_stats_inc(s_ib_rdma_mr_1m_free);
 			list_del(&ibmr->unmap_list);
 			if (pool->use_fastreg) {
+				__rds_ib_teardown_mr(ibmr);
 				if (ibmr->page_list)
 					ib_free_fast_reg_page_list(ibmr->page_list);
 				if (ibmr->mr)
@@ -1089,15 +1089,16 @@ void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents,
 /* Fastreg related functions */
 
 static int rds_ib_map_scatterlist(struct rds_ib_device *rds_ibdev,
-				  struct rds_ib_mr *ibmr)
+				  struct rds_ib_mr *ibmr,
+				  struct scatterlist *sg, unsigned int sg_len)
 {
 	struct ib_device *dev = rds_ibdev->dev;
 	int i, j, ret, page_cnt;
 	u32 len;
+	int sg_dma_len;
 
-	ibmr->sg_dma_len = ib_dma_map_sg(dev, ibmr->sg, ibmr->sg_len,
-					 DMA_BIDIRECTIONAL);
-	if (unlikely(!ibmr->sg_dma_len)) {
+	sg_dma_len = ib_dma_map_sg(dev, sg, sg_len, DMA_BIDIRECTIONAL);
+	if (unlikely(!sg_dma_len)) {
 		pr_warn("RDS/IB: dma_map_sg failed!\n");
 		return -EBUSY;
 	}
@@ -1107,9 +1108,9 @@ static int rds_ib_map_scatterlist(struct rds_ib_device *rds_ibdev,
 	len = 0;
 
 	ret = -EINVAL;
-	for (i = 0; i < ibmr->sg_dma_len; ++i) {
-		unsigned int dma_len = ib_sg_dma_len(dev, &ibmr->sg[i]);
-		u64 dma_addr = ib_sg_dma_address(dev, &ibmr->sg[i]);
+	for (i = 0; i < sg_dma_len; ++i) {
+		unsigned int dma_len = ib_sg_dma_len(dev, &sg[i]);
+		u64 dma_addr = ib_sg_dma_address(dev, &sg[i]);
 
 		ibmr->sg_byte_len += dma_len;
 		if (dma_addr & ~PAGE_MASK) {
@@ -1120,7 +1121,7 @@ static int rds_ib_map_scatterlist(struct rds_ib_device *rds_ibdev,
 		}
 
 		if ((dma_addr + dma_len) & ~PAGE_MASK) {
-			if (i < ibmr->sg_dma_len - 1)
+			if (i < sg_dma_len - 1)
 				goto out_unmap;
 			else
 				++ibmr->dma_npages;
@@ -1137,31 +1138,38 @@ static int rds_ib_map_scatterlist(struct rds_ib_device *rds_ibdev,
 	}
 
 	page_cnt = 0;
-	for (i = 0; i < ibmr->sg_dma_len; ++i) {
-		unsigned int dma_len = ib_sg_dma_len(dev, &ibmr->sg[i]);
-		u64 dma_addr = ib_sg_dma_address(dev, &ibmr->sg[i]);
+	for (i = 0; i < sg_dma_len; ++i) {
+		unsigned int dma_len = ib_sg_dma_len(dev, &sg[i]);
+		u64 dma_addr = ib_sg_dma_address(dev, &sg[i]);
 
 		for (j = 0; j < dma_len; j += PAGE_SIZE)
 			ibmr->page_list->page_list[page_cnt++] =
 				(dma_addr & PAGE_MASK) + j;
 	}
 
 	ibmr->dma_npages = page_cnt;
-	return 0;
+	return sg_dma_len;
 
 out_unmap:
+	if (sg_dma_len)
+		ib_dma_unmap_sg(rds_ibdev->dev, sg, sg_len, DMA_BIDIRECTIONAL);
 	return ret;
 }
 
 static int rds_ib_rdma_build_fastreg(struct rds_ib_device *rds_ibdev,
 				     struct rds_ib_mr *ibmr)
 {
-	struct ib_fast_reg_wr f_wr;
-	struct ib_send_wr *failed_wr;
+	struct ib_fast_reg_wr fastreg_wr;
+	struct ib_send_wr inv_wr, *failed_wr, *first_wr = NULL;
 	struct ib_qp *qp;
 	atomic_t *n_wrs;
 	int ret = 0;
 
+	if (ibmr->fr_state == MR_IS_STALE) {
+		WARN_ON(true);
+		return -EAGAIN;
+	}
+
 	if (ibmr->ic) {
 		n_wrs = &ibmr->ic->i_fastreg_wrs;
 		qp = ibmr->ic->i_cm_id->qp;
@@ -1171,38 +1179,48 @@ static int rds_ib_rdma_build_fastreg(struct rds_ib_device *rds_ibdev,
 		qp = rds_ibdev->fastreg_qp;
 	}
 
-	while (atomic_dec_return(n_wrs) <= 0) {
-		atomic_inc(n_wrs);
+	while (atomic_sub_return(2, n_wrs) <= 0) {
+		atomic_add(2, n_wrs);
 		/* Depending on how many times schedule() is called,
 		 * we could replace it with wait_event() in future.
 		 */
 		schedule();
 	}
 
+	if (ibmr->fr_state == MR_IS_VALID) {
+		memset(&inv_wr, 0, sizeof(inv_wr));
+		inv_wr.wr_id = RDS_MR_INV_WR_ID;
+		inv_wr.opcode = IB_WR_LOCAL_INV;
+		inv_wr.ex.invalidate_rkey = ibmr->mr->rkey;
+		first_wr = &inv_wr;
+	} else
+		ibmr->fr_state = MR_IS_VALID;
+
 	ib_update_fast_reg_key(ibmr->mr, ibmr->remap_count++);
-	WARN_ON(ibmr->fr_state != MR_IS_INVALID);
-	ibmr->fr_state = MR_IS_VALID;
-
-	memset(&f_wr, 0, sizeof(f_wr));
-	f_wr.wr.wr_id = (u64)ibmr;
-	f_wr.wr.opcode = IB_WR_FAST_REG_MR;
-	f_wr.length = ibmr->sg_byte_len;
-	f_wr.rkey = ibmr->mr->rkey;
-	f_wr.page_list = ibmr->page_list;
-	f_wr.page_list_len = ibmr->dma_npages;
-	f_wr.page_shift = PAGE_SHIFT;
-	f_wr.access_flags = IB_ACCESS_LOCAL_WRITE |
-			    IB_ACCESS_REMOTE_READ |
-			    IB_ACCESS_REMOTE_WRITE;
-	f_wr.iova_start = 0;
-	f_wr.wr.send_flags = IB_SEND_SIGNALED;
-
-	failed_wr = &f_wr.wr;
-	ret = ib_post_send(qp, &f_wr.wr, &failed_wr);
-	BUG_ON(failed_wr != &f_wr.wr);
+
+	memset(&fastreg_wr, 0, sizeof(fastreg_wr));
+	fastreg_wr.wr.wr_id = (u64)ibmr;
+	fastreg_wr.wr.opcode = IB_WR_FAST_REG_MR;
+	fastreg_wr.length = ibmr->sg_byte_len;
+	fastreg_wr.rkey = ibmr->mr->rkey;
+	fastreg_wr.page_list = ibmr->page_list;
+	fastreg_wr.page_list_len = ibmr->dma_npages;
+	fastreg_wr.page_shift = PAGE_SHIFT;
+	fastreg_wr.access_flags = IB_ACCESS_LOCAL_WRITE |
+				  IB_ACCESS_REMOTE_READ |
+				  IB_ACCESS_REMOTE_WRITE;
+	fastreg_wr.iova_start = 0;
+	fastreg_wr.wr.send_flags = IB_SEND_SIGNALED;
+
+	if (!first_wr)
+		first_wr = &fastreg_wr.wr;
+	else
+		first_wr->next = &fastreg_wr.wr;
+
+	ret = ib_post_send(qp, first_wr, &failed_wr);
 	if (ret) {
-		atomic_inc(n_wrs);
-		ibmr->fr_state = MR_IS_INVALID;
+		atomic_add(2, n_wrs);
+		ibmr->fr_state = MR_IS_STALE;
 		pr_warn_ratelimited("RDS/IB: %s:%d ib_post_send returned %d\n",
 				    __func__, __LINE__, ret);
 		goto out;
@@ -1225,23 +1243,26 @@ static int rds_ib_map_fastreg_mr(struct rds_ib_device *rds_ibdev,
 				 struct scatterlist *sg, unsigned int sg_len)
 {
 	int ret = 0;
+	int sg_dma_len = 0;
 
-	/* We want to teardown old ibmr values here and fill it up with
-	 * new sg values
-	 */
-	rds_ib_teardown_mr(ibmr);
-
-	ibmr->sg = sg;
-	ibmr->sg_len = sg_len;
-
-	ret = rds_ib_map_scatterlist(rds_ibdev, ibmr);
-	if (ret)
+	ret = rds_ib_map_scatterlist(rds_ibdev, ibmr, sg, sg_len);
+	if (ret < 0)
 		goto out;
+	sg_dma_len = ret;
 
 	ret = rds_ib_rdma_build_fastreg(rds_ibdev, ibmr);
 	if (ret)
 		goto out;
 
+	/* Teardown previous values here since we
+	 * finished invalidating the previous key
+	 */
+	__rds_ib_teardown_mr(ibmr);
+
+	ibmr->sg = sg;
+	ibmr->sg_len = sg_len;
+	ibmr->sg_dma_len = sg_dma_len;
+
 	if (ibmr->pool->pool_type == RDS_IB_MR_8K_POOL)
 		rds_ib_stats_inc(s_ib_rdma_mr_8k_used);
 	else
@@ -1250,56 +1271,21 @@ static int rds_ib_map_fastreg_mr(struct rds_ib_device *rds_ibdev,
 	return ret;
 
 out:
-	if (ibmr->sg_dma_len) {
-		ib_dma_unmap_sg(rds_ibdev->dev, ibmr->sg, ibmr->sg_len,
-				DMA_BIDIRECTIONAL);
-		ibmr->sg_dma_len = 0;
-	}
-	ibmr->sg = NULL;
-	ibmr->sg_len = 0;
-	return ret;
-}
-
-static int rds_ib_fastreg_inv(struct rds_ib_mr *ibmr)
-{
-	struct ib_send_wr s_wr, *failed_wr;
-	int ret = 0;
-
-	down_read(&ibmr->device->fastreg_lock);
-
-	if (ibmr->fr_state != MR_IS_VALID)
-		goto out;
-
-	ibmr->fr_state = MR_IS_INVALID;
-
-	memset(&s_wr, 0, sizeof(s_wr));
-	s_wr.wr_id = (u64)ibmr;
-	s_wr.opcode = IB_WR_LOCAL_INV;
-	s_wr.ex.invalidate_rkey = ibmr->mr->rkey;
-	s_wr.send_flags = IB_SEND_SIGNALED;
-
-	failed_wr = &s_wr;
-	ret = ib_post_send(ibmr->device->fastreg_qp, &s_wr, &failed_wr);
-	BUG_ON(failed_wr != &s_wr);
-	if (ret) {
-		ibmr->fr_state = MR_IS_STALE;
-		pr_warn_ratelimited("RDS/IB: %s:%d ib_post_send returned %d\n",
-				    __func__, __LINE__, ret);
-		goto out;
-	}
-
-	wait_for_completion(&ibmr->wr_comp);
-out:
-	up_read(&ibmr->device->fastreg_lock);
+	if (sg_dma_len)
+		ib_dma_unmap_sg(rds_ibdev->dev, sg, sg_len, DMA_BIDIRECTIONAL);
 	return ret;
 }
 
 void rds_ib_fcq_handler(struct rds_ib_device *rds_ibdev, struct ib_wc *wc)
 {
-	struct rds_ib_mr *ibmr = (struct rds_ib_mr *)wc->wr_id;
-	enum rds_ib_fr_state fr_state = ibmr->fr_state;
+	struct rds_ib_mr *ibmr;
+
+	if (wc->wr_id == RDS_MR_INV_WR_ID)
+		return;
+	ibmr = (struct rds_ib_mr *)wc->wr_id;
 
 	WARN_ON(ibmr->fr_state == MR_IS_STALE);
+	WARN_ON(ibmr->fr_state == MR_IS_INVALID);
 
 	if (wc->status != IB_WC_SUCCESS) {
 		pr_warn("RDS: IB: MR completion on fastreg qp status %u vendor_err %u\n",
@@ -1308,20 +1294,20 @@ void rds_ib_fcq_handler(struct rds_ib_device *rds_ibdev, struct ib_wc *wc)
 		queue_work(rds_wq, &rds_ibdev->fastreg_reset_w);
 	}
 
-	if (fr_state == MR_IS_INVALID) {
-		complete(&ibmr->wr_comp);
-	} else if (fr_state == MR_IS_VALID) {
-		atomic_inc(&rds_ibdev->fastreg_wrs);
-		complete(&ibmr->wr_comp);
-	}
+	atomic_add(2, &rds_ibdev->fastreg_wrs);
+	complete(&ibmr->wr_comp);
 }
 
 void rds_ib_mr_cqe_handler(struct rds_ib_connection *ic, struct ib_wc *wc)
 {
-	struct rds_ib_mr *ibmr = (struct rds_ib_mr *)wc->wr_id;
-	enum rds_ib_fr_state fr_state = ibmr->fr_state;
+	struct rds_ib_mr *ibmr;
+
+	if (wc->wr_id == RDS_MR_INV_WR_ID)
+		return;
+	ibmr = (struct rds_ib_mr *)wc->wr_id;
 
 	WARN_ON(ibmr->fr_state == MR_IS_STALE);
+	WARN_ON(ibmr->fr_state == MR_IS_INVALID);
 
 	if (wc->status != IB_WC_SUCCESS) {
 		if (rds_conn_up(ic->conn)) {
@@ -1333,10 +1319,6 @@ void rds_ib_mr_cqe_handler(struct rds_ib_connection *ic, struct ib_wc *wc)
 		ibmr->fr_state = MR_IS_STALE;
 	}
 
-	if (fr_state == MR_IS_INVALID) {
-		complete(&ibmr->wr_comp);
-	} else if (fr_state == MR_IS_VALID) {
-		atomic_inc(&ic->i_fastreg_wrs);
-		complete(&ibmr->wr_comp);
-	}
+	atomic_add(2, &ic->i_fastreg_wrs);
+	complete(&ibmr->wr_comp);
 }
