Merged

Changes from all commits (29 commits)
d7eb564
hmem_neuron: Add support for neuron dma-buf
aws-ceenugal Mar 13, 2023
9292861
prov/efa: Add support for neuron dma-buf
aws-ceenugal Mar 13, 2023
7d3f4d7
prov/efa: remove redundant progress after triggering handshake
wenduwan Mar 13, 2023
b02ad48
prov/efa: use FI_OPT_CUDA_API_PERMITTED to replace cuda_xfer_setting()
wzamazon Mar 13, 2023
140c410
prov/shm: implement the FI_OPT_CUDA_API_PERMITTED option
wzamazon Mar 13, 2023
e4e801e
prov/verbs: implement the FI_OPT_CUDA_API_PERMITTED option
wzamazon Mar 13, 2023
a71384b
prov/rxm: implement the FI_OPT_CUDA_API_PERMITTED option
wzamazon Mar 13, 2023
d54d6fa
man/fi_endpoint: document improvement for FI_OPT_CUDA_API_PERMITTED
wzamazon Mar 13, 2023
5edb30d
Updated nroff-generated man pages
ofiwg-bot Mar 15, 2023
155f8fc
Merge pull request #8659 from ofiwg/pr/update-nroff-generated-man-pag…
github-actions[bot] Mar 15, 2023
b49fb7f
prov/shm: fix start_msg call
aingerson Mar 14, 2023
f93e910
prov/shm: separate smr_generic_msg into msg and tagged recv
aingerson Mar 14, 2023
8d8e4d7
prov/shm: fix incorrect discard call when cleaning up unexpected queues
aingerson Mar 14, 2023
a74ea0f
prov/tcp: Fix suspected bug where RX entry is freed twice
sydidelot Mar 15, 2023
4cd6309
man/verbs: Fix link in verbs doc
kgerheiser Mar 16, 2023
4634ab6
hmem/rocr: Fix compilation issue
amirshehataornl Mar 11, 2023
6c86459
prov/shm: Update SHM to use ROCR
amirshehataornl Nov 25, 2022
aea1454
prov/efa: do not write err completion for FI_EAGAIN error
wzamazon Mar 17, 2023
b7f25be
prov/shm: smr_generic_rma() write error completion with positive errno
wzamazon Mar 17, 2023
9add342
core/util_cq: fix the behavior of cq_read for FI_PEER
shijin-aws Mar 16, 2023
580c674
prov/efa: Implement owner_ops in peer API.
shijin-aws Mar 14, 2023
e88954e
prov/efa: Update peer_rx_entry fields from rx entry.
shijin-aws Mar 14, 2023
d4709a4
prov/efa: Share peer_srx between efa and shm provider.
shijin-aws Mar 15, 2023
a9095f1
prov/efa: Update shm-related procedures for msg/tagged ops.
shijin-aws Mar 15, 2023
d422cee
prov/efa: Update the shm cq procedures.
shijin-aws Mar 16, 2023
754d709
prov/shm: fix the srx import procedure.
shijin-aws Mar 17, 2023
038ae4a
Fix bugs in efa_rdm_srx.c
shijin-aws Mar 16, 2023
3d77c02
fix rxr_msg.c
shijin-aws Mar 17, 2023
f3c9505
Merge branch 'peer_devel' into peer_devel_sjina
shijin-aws Mar 17, 2023
1 change: 1 addition & 0 deletions include/ofi_hmem.h
@@ -216,6 +216,7 @@ int neuron_hmem_init(void);
int neuron_hmem_cleanup(void);
void *neuron_alloc(void **handle, size_t size);
void neuron_free(void **handle);
int neuron_get_dmabuf_fd(uint64_t va, uint64_t size, int* fd);

int synapseai_init(void);
int synapseai_cleanup(void);
3 changes: 3 additions & 0 deletions man/fi_endpoint.3.md
@@ -573,6 +573,9 @@ The following option levels and option names and parameters are defined.
making such calls, the user can do so by setting this option to false.
If an endpoint's support for CUDA memory relies on making calls to the CUDA API,
it will return -FI_EOPNOTSUPP for the call to fi_setopt.
If either the CUDA library or a CUDA device is not available, the endpoint will
return -FI_EINVAL.
All providers that support FI_HMEM capability implement this option.
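
As an illustration, a minimal sketch of how an application might apply this option to an open endpoint; ep is a placeholder for an already-opened endpoint, and the bool option value and return codes follow the description above:

#include <stdbool.h>
#include <rdma/fabric.h>
#include <rdma/fi_endpoint.h>

/* Sketch: forbid the provider from calling CUDA APIs on this endpoint. */
bool optval = false;
int ret = fi_setopt(&ep->fid, FI_OPT_ENDPOINT, FI_OPT_CUDA_API_PERMITTED,
		    &optval, sizeof(optval));
/* ret == -FI_EOPNOTSUPP: CUDA memory support requires CUDA API calls;
 * ret == -FI_EINVAL: CUDA library or device unavailable. */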

## fi_tc_dscp_set

2 changes: 1 addition & 1 deletion man/fi_verbs.7.md
@@ -152,7 +152,7 @@ The support for fork in the provider has the following limitations:

### XRC Transport
The XRC transport is intended to be used when layered with the RXM provider and
requires the use of shared receive contexts. See [`fi_rxm`(7)](fi_rxm.7.thml).
requires the use of shared receive contexts. See [`fi_rxm`(7)](fi_rxm.7.html).
To enable XRC, the following environment variables must usually be set:
FI_VERBS_PREFER_XRC and FI_OFI_RXM_USE_SRX.

5 changes: 4 additions & 1 deletion man/man3/fi_endpoint.3
@@ -1,6 +1,6 @@
.\" Automatically generated by Pandoc 2.9.2.1
.\"
.TH "fi_endpoint" "3" "2023\-03\-08" "Libfabric Programmer\[cq]s Manual" "#VERSION#"
.TH "fi_endpoint" "3" "2023\-03\-15" "Libfabric Programmer\[cq]s Manual" "#VERSION#"
.hy
.SH NAME
.PP
@@ -639,6 +639,9 @@ If the user wishes to prohibit an endpoint from making such calls, they can
do so by setting this option to false.
If an endpoint\[cq]s support for CUDA memory relies on making calls to the
CUDA API, it will return -FI_EOPNOTSUPP for the call to fi_setopt.
If either the CUDA library or a CUDA device is not available, the endpoint
will return -FI_EINVAL.
All providers that support FI_HMEM capability implement this option.
.RE
.SS fi_tc_dscp_set
.PP
3 changes: 0 additions & 3 deletions prov/efa/src/efa_domain.c
@@ -213,9 +213,6 @@ int efa_domain_open(struct fid_fabric *fabric_fid, struct fi_info *info,
/* Check the value of environment variable FI_EFA_USE_DEVICE_RDMA */
efa_domain->use_device_rdma = rxr_env_get_use_device_rdma();

/* Check the value of environment variable FI_HMEM_CUDA_ENABLE_XFER */
efa_domain->cuda_xfer_setting = cuda_get_xfer_setting();

efa_domain->mr_local = ofi_mr_local(info);
if (EFA_EP_TYPE_IS_DGRAM(info) && !efa_domain->mr_local) {
EFA_WARN(FI_LOG_EP_DATA, "dgram require FI_MR_LOCAL, but application does not support it\n");
1 change: 0 additions & 1 deletion prov/efa/src/efa_domain.h
@@ -57,7 +57,6 @@ struct efa_domain {
uint64_t rdm_mode;
size_t rdm_cq_size;
int use_device_rdma;
enum cuda_xfer_setting cuda_xfer_setting;
struct dlist_entry list_entry; /* linked to g_efa_domain_list */
};

26 changes: 20 additions & 6 deletions prov/efa/src/efa_hmem.c
@@ -164,11 +164,13 @@ static int efa_domain_hmem_info_init_cuda(struct efa_domain *efa_domain)

info->p2p_disabled_by_user = false;

/*
* Require p2p for FI_HMEM_CUDA unless the user exlipictly enables
* FI_HMEM_CUDA_ENABLE_XFER
/* If the user is using libfabric API 1.18 or later, the EFA provider is
 * permitted by default to use the CUDA library to support CUDA memory,
 * therefore p2p is not required.
 */
info->p2p_required_by_impl = efa_domain->cuda_xfer_setting != CUDA_XFER_ENABLED;
if (FI_VERSION_GE(efa_domain->util_domain.fabric->fabric_fid.api_version, FI_VERSION(1,18)))
info->p2p_required_by_impl = !hmem_ops[FI_HMEM_CUDA].initialized;
else
info->p2p_required_by_impl = true;

ibv_mr = ibv_reg_mr(g_device_list[0].ibv_pd, ptr, len, ibv_access);
if (!ibv_mr) {
@@ -214,11 +216,12 @@ static int efa_domain_hmem_info_init_neuron(struct efa_domain *efa_domain)
{
#if HAVE_NEURON
struct efa_hmem_info *info = &efa_domain->hmem_info[FI_HMEM_NEURON];
struct ibv_mr *ibv_mr;
struct ibv_mr *ibv_mr = NULL;
int ibv_access = IBV_ACCESS_LOCAL_WRITE;
void *handle;
void *ptr = NULL;
size_t len = ofi_get_page_size() * 2, tmp_value;
int dmabuf_fd;
int ret;

if (!ofi_hmem_is_initialized(FI_HMEM_NEURON)) {
@@ -250,7 +253,18 @@ static int efa_domain_hmem_info_init_neuron(struct efa_domain *efa_domain)
/* Neuron currently requires P2P */
info->p2p_required_by_impl = true;

ibv_mr = ibv_reg_mr(g_device_list[0].ibv_pd, ptr, len, ibv_access);
ret = neuron_get_dmabuf_fd((uint64_t)ptr, (uint64_t)len, &dmabuf_fd);
if (ret == FI_SUCCESS) {
ibv_mr = ibv_reg_dmabuf_mr(
g_device_list[0].ibv_pd, 0,
len, (uint64_t)ptr, dmabuf_fd, ibv_access);
} else if (ret == -FI_ENOPROTOOPT) {
EFA_INFO(FI_LOG_MR,
"Unable to retrieve dmabuf fd of Neuron device buffer, "
"Fall back to ibv_reg_mr\n");
ibv_mr = ibv_reg_mr(g_device_list[0].ibv_pd, ptr, len, ibv_access);
}

if (!ibv_mr) {
info->p2p_supported_by_device = false;
/* We do not expect to support Neuron on non p2p systems */
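
Both Neuron hunks in this PR follow the same pattern: probe for a dma-buf file descriptor first, and only fall back to address-based registration when the driver reports the operation is unsupported. A condensed sketch of that flow follows; reg_neuron_buf is a hypothetical helper, and pd, ptr, len, and access stand in for the surrounding variables:

#include <infiniband/verbs.h>
#include "ofi_hmem.h"

/* Hypothetical helper condensing the fallback logic shown above. */
static struct ibv_mr *reg_neuron_buf(struct ibv_pd *pd, void *ptr,
				     size_t len, int access)
{
	int dmabuf_fd;
	int ret = neuron_get_dmabuf_fd((uint64_t)ptr, (uint64_t)len, &dmabuf_fd);

	if (ret == FI_SUCCESS)
		/* Driver exposed the buffer as a dma-buf: register by fd. */
		return ibv_reg_dmabuf_mr(pd, 0, len, (uint64_t)ptr,
					 dmabuf_fd, access);
	if (ret == -FI_ENOPROTOOPT)
		/* Driver predates dma-buf support: register by address. */
		return ibv_reg_mr(pd, ptr, len, access);
	return NULL;	/* any other error: registration fails */
}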
51 changes: 47 additions & 4 deletions prov/efa/src/efa_mr.c
@@ -262,9 +262,8 @@ static int efa_mr_hmem_setup(struct efa_mr *efa_mr,
EFA_WARN(FI_LOG_MR,
"Unable to register handle for GPU memory. err: %d buf: %p len: %zu\n",
err, attr->mr_iov->iov_base, attr->mr_iov->iov_len);
/* When gdrcopy pin buf failed, fallback to cudaMemcpy when user enables cuda xfer */
if (efa_mr->domain->cuda_xfer_setting != CUDA_XFER_ENABLED)
return err;
/* When gdrcopy fails to pin the buffer, fall back to cudaMemcpy */
efa_mr->peer.use_gdrcopy = false;
efa_mr->peer.device.cuda = attr->device.cuda;
} else {
efa_mr->peer.use_gdrcopy = true;
@@ -572,6 +571,50 @@ static struct ibv_mr *efa_mr_reg_ibv_mr(struct efa_mr *efa_mr, struct fi_mr_attr
mr_attr->mr_iov->iov_len, access);
}
}
#elif HAVE_NEURON
/**
* @brief Register a memory buffer with rdma-core api.
*
* @param efa_mr the ptr to the efa_mr object
* @param mr_attr the ptr to the fi_mr_attr object
* @param access the desired memory protection attributes
* @return struct ibv_mr* the ptr to the registered MR
*/
static struct ibv_mr *efa_mr_reg_ibv_mr(struct efa_mr *efa_mr, struct fi_mr_attr *mr_attr, int access)
{
if (!efa_mr_is_neuron(efa_mr)) {
return ibv_reg_mr(
efa_mr->domain->ibv_pd,
(void *)mr_attr->mr_iov->iov_base,
mr_attr->mr_iov->iov_len, access);
}

int dmabuf_fd, ret;
ret = neuron_get_dmabuf_fd(
(uint64_t) mr_attr->mr_iov->iov_base,
(uint64_t) mr_attr->mr_iov->iov_len,
&dmabuf_fd);

if (ret == FI_SUCCESS) {
/* Success => invoke ibv_reg_dmabuf_mr */
return ibv_reg_dmabuf_mr(
efa_mr->domain->ibv_pd, 0,
mr_attr->mr_iov->iov_len,
(uint64_t)mr_attr->mr_iov->iov_base,
dmabuf_fd, access);
} else if (ret == -FI_ENOPROTOOPT) {
/* Protocol not available => fall back */
EFA_INFO(FI_LOG_MR,
"Unable to get dmabuf fd for Neuron device buffer, "
"Fall back to ibv_reg_mr\n");
return ibv_reg_mr(
efa_mr->domain->ibv_pd,
(void *)mr_attr->mr_iov->iov_base,
mr_attr->mr_iov->iov_len, access);
}

return NULL;
}
#else
/**
* @brief Register a memory buffer with rdma-core api.
@@ -587,7 +630,7 @@ static struct ibv_mr *efa_mr_reg_ibv_mr(struct efa_mr *efa_mr, struct fi_mr_attr
(void *)mr_attr->mr_iov->iov_base,
mr_attr->mr_iov->iov_len, access);
}
#endif /* HAVE_SYNAPSEAI */
#endif

#if HAVE_CUDA
static inline
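
To show where this registration path is exercised from, here is a hedged sketch of an application registering a Neuron device buffer through the public libfabric API; buf, len, device_id, and domain are application-supplied placeholders, and the fi_mr_attr fields follow fi_mr(3):

#include <sys/uio.h>
#include <rdma/fabric.h>
#include <rdma/fi_domain.h>

/* Sketch: register a Neuron device buffer with an open domain.
 * buf, len, device_id, and domain are placeholders. */
struct iovec iov = { .iov_base = buf, .iov_len = len };
struct fi_mr_attr attr = {
	.mr_iov        = &iov,
	.iov_count     = 1,
	.access        = FI_SEND | FI_RECV,
	.iface         = FI_HMEM_NEURON,
	.device.neuron = device_id,
};
struct fid_mr *mr;
int ret = fi_mr_regattr(domain, &attr, 0, &mr);
/* With a dma-buf capable driver, the provider takes the
 * ibv_reg_dmabuf_mr branch added above. */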