This turned out to be a nasty little bug. It turns out there is a place where the rxe driver registers memory using an area of memory that is not available on the ARM processor we are using. Here's the patch that made it work...
2 files changed, 15 insertions(+), 2 deletions(-)
diff --git a/drivers/infiniband/sw/rxe/rxe_mr.c b/drivers/infiniband/sw/rxe/rxe_mr.c
index 5c2684b..f2dc5a7 100644
--- a/drivers/infiniband/sw/rxe/rxe_mr.c
+++ b/drivers/infiniband/sw/rxe/rxe_mr.c
@@ -31,6 +31,7 @@
* SOFTWARE.
*/
+#include <linux/highmem.h>
#include "rxe.h"
#include "rxe_loc.h"
@@ -94,7 +95,15 @@ static void rxe_mem_init(int access, struct rxe_mem *mem)
void rxe_mem_cleanup(struct rxe_pool_entry *arg)
{
struct rxe_mem *mem = container_of(arg, typeof(*mem), pelem);
- int i;
+ int i, entry;
+ struct scatterlist *sg;
+
+ if (mem->kmap_occurred) {
+ for_each_sg(mem->umem->sg_head.sgl, sg,
+ mem->umem->nmap, entry) {
+ kunmap(sg_page(sg));
+ }
+ }
if (mem->umem)
ib_umem_release(mem->umem);
@@ -200,12 +209,14 @@ int rxe_mem_init_user(struct rxe_dev *rxe, struct rxe_pd *pd, u64 start,
buf = map[0]->buf;
for_each_sg(umem->sg_head.sgl, sg, umem->nmap, entry) {
- vaddr = page_address(sg_page(sg));
+ // vaddr = page_address(sg_page(sg));
+ vaddr = kmap(sg_page(sg));
if (!vaddr) {
pr_warn("null vaddr\n");
err = -ENOMEM;
goto err1;
}
+ mem->kmap_occurred = 1;
buf->addr = (uintptr_t)vaddr;
buf->size = BIT(umem->page_shift);
diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.h b/drivers/infiniband/sw/rxe/rxe_verbs.h
index af1470d..9bd7eac 100644
--- a/drivers/infiniband/sw/rxe/rxe_verbs.h
+++ b/drivers/infiniband/sw/rxe/rxe_verbs.h
@@ -343,6 +343,8 @@ struct rxe_mem {
u32 num_map;
struct rxe_map **map;
+
+ int kmap_occurred;
};
struct rxe_mc_grp {
--
2.7.4
The idea is that, to make this work on ARM, you need to use kmap()/kunmap() rather than page_address() to handle these memory regions, which are used by both the kernel and user space...
Thanks,
FM