GPUDirect: CUDA aware MPI

http://keeneland.gatech.edu/software/gpudirect

https://www.olcf.ornl.gov/tutorials/gpudirect-mpich-enabled-cuda/

CUDA C

direct.cpp

#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
#include <mpi.h>
int main( int argc, char** argv )
{
    MPI_Init (&argc, &argv);
    int direct;
    int rank, size;
    int *h_buff = NULL;
    int *d_rank = NULL;
    int *d_buff = NULL;
    size_t bytes;
    int i;
    // Ensure that RDMA ENABLED CUDA is set correctly
    direct = getenv("MPICH_RDMA_ENABLED_CUDA") == NULL ? 0 : atoi(getenv("MPICH_RDMA_ENABLED_CUDA"));
    if(direct != 1){
        printf ("MPICH_RDMA_ENABLED_CUDA not enabled!n");
        exit (EXIT_FAILURE);
    }
    // Get MPI rank and size
    MPI_Comm_rank (MPI_COMM_WORLD, &rank);
    MPI_Comm_size (MPI_COMM_WORLD, &size);
    // Allocate host and device buffers and copy rank value to GPU
    bytes = size*sizeof(int);
    h_buff = (int*)malloc(bytes);
    cudaMalloc(&d_buff, bytes);
    cudaMalloc(&d_rank, sizeof(int));
    cudaMemcpy(d_rank, &rank, sizeof(int), cudaMemcpyHostToDevice);
    // Perform Allgather using device buffer
    MPI_Allgather(d_rank, 1, MPI_INT, d_buff, 1, MPI_INT, MPI_COMM_WORLD);
    // Check that the GPU buffer is correct
    cudaMemcpy(h_buff, d_buff, bytes, cudaMemcpyDeviceToHost);
    for(i=0; i<size; i++){
        if(h_buff[i] != i) {
            printf ("Alltoall Failed!n");
            exit (EXIT_FAILURE);
        }
    }
    if(rank==0)
        printf("Success!n");
    // Clean up
    free(h_buff);
    cudaFree(d_buff);
    cudaFree(d_rank);
    MPI_Finalize();
    return 0;
}
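
For comparison, without a CUDA-aware MPI the same exchange must be staged through host memory. Below is a minimal sketch of that staged-copy equivalent, reusing the rank, h_buff, d_rank, d_buff, and bytes variables from direct.cpp above; it is an illustration, not part of the original example.

// Staged-copy version: move the value off the device, let MPI operate on
// host buffers only, then copy the gathered result back to the device.
int h_rank;
cudaMemcpy(&h_rank, d_rank, sizeof(int), cudaMemcpyDeviceToHost);
MPI_Allgather(&h_rank, 1, MPI_INT, h_buff, 1, MPI_INT, MPI_COMM_WORLD);
cudaMemcpy(d_buff, h_buff, bytes, cudaMemcpyHostToDevice);

With MPICH_RDMA_ENABLED_CUDA=1 the device pointers can instead be handed to MPI_Allgather directly, as in direct.cpp, and the data movement is handled by the MPI library.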

Compiling

For ease of compiling, the GNU programming environment is used.

$ module load cudatoolkit
$ module switch PrgEnv-pgi PrgEnv-gnu
$ CC -lcudart direct.cpp -o direct.out
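
On systems without the Cray compiler wrappers, the same source can typically be built with an MPI C++ wrapper and linked against the CUDA runtime; the paths below are illustrative assumptions and depend on the local CUDA installation.

$ mpicxx direct.cpp -o direct.out -I$CUDA_HOME/include -L$CUDA_HOME/lib64 -lcudart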

Running

$ export LD_LIBRARY_PATH=$CRAY_LD_LIBRARY_PATH:$LD_LIBRARY_PATH
$ export MPICH_RDMA_ENABLED_CUDA=1
$ aprun -n2 -N1 ./direct.out
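
If GPUDirect is working and the device-buffer Allgather completes correctly, rank 0 prints:

Success!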