http://keeneland.gatech.edu/software/gpudirect
https://www.olcf.ornl.gov/tutorials/gpudirect-mpich-enabled-cuda/
CUDA C
direct.cpp
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
#include <mpi.h>

int main( int argc, char** argv )
{
    MPI_Init (&argc, &argv);

    int direct;
    int rank, size;
    int *h_buff = NULL;
    int *d_rank = NULL;
    int *d_buff = NULL;
    size_t bytes;
    int i;

    // Ensure that RDMA ENABLED CUDA is set correctly
    direct = getenv("MPICH_RDMA_ENABLED_CUDA")==NULL?0:atoi(getenv("MPICH_RDMA_ENABLED_CUDA"));
    if(direct != 1){
        printf ("MPICH_RDMA_ENABLED_CUDA not enabled!\n");
        exit (EXIT_FAILURE);
    }

    // Get MPI rank and size
    MPI_Comm_rank (MPI_COMM_WORLD, &rank);
    MPI_Comm_size (MPI_COMM_WORLD, &size);

    // Allocate host and device buffers and copy rank value to GPU
    bytes = size*sizeof(int);
    h_buff = (int*)malloc(bytes);
    cudaMalloc(&d_buff, bytes);
    cudaMalloc(&d_rank, sizeof(int));
    cudaMemcpy(d_rank, &rank, sizeof(int), cudaMemcpyHostToDevice);

    // Perform Allgather using device buffers
    MPI_Allgather(d_rank, 1, MPI_INT, d_buff, 1, MPI_INT, MPI_COMM_WORLD);

    // Check that the GPU buffer is correct
    cudaMemcpy(h_buff, d_buff, bytes, cudaMemcpyDeviceToHost);
    for(i=0; i<size; i++){
        if(h_buff[i] != i) {
            printf ("Allgather Failed!\n");
            exit (EXIT_FAILURE);
        }
    }
    if(rank==0)
        printf("Success!\n");

    // Clean up
    free(h_buff);
    cudaFree(d_buff);
    cudaFree(d_rank);

    MPI_Finalize();

    return 0;
}
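Note that both buffers passed to MPI_Allgather, d_rank and d_buff, are device pointers; the CUDA-enabled MPICH moves the data directly between GPU buffers. For comparison, without a CUDA-aware MPI the same exchange would have to be staged through host memory. A minimal sketch of that traditional pattern, reusing the buffer names from direct.cpp, is shown below (illustration only, not part of the tutorial code):

// Without CUDA-aware MPI: stage the data through host memory
int h_rank;
cudaMemcpy(&h_rank, d_rank, sizeof(int), cudaMemcpyDeviceToHost);        // device -> host
MPI_Allgather(&h_rank, 1, MPI_INT, h_buff, 1, MPI_INT, MPI_COMM_WORLD);  // host buffers only
cudaMemcpy(d_buff, h_buff, bytes, cudaMemcpyHostToDevice);               // host -> device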
Compiling
For ease of compiling, the GNU programming environment will be used.
$ module load cudatoolkit
$ module switch PrgEnv-pgi PrgEnv-gnu
$ CC -lcudart direct.cpp -o direct.out
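The Cray compiler wrapper CC supplies the MPI include and library paths itself, and the cudatoolkit module makes the CUDA runtime visible to it, so only -lcudart needs to be added. On a system without the Cray wrappers, a hypothetical equivalent would point an MPI wrapper compiler at the CUDA runtime explicitly (paths depend on the local installation):

$ mpicxx direct.cpp -I$CUDA_HOME/include -L$CUDA_HOME/lib64 -lcudart -o direct.out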
Running
$ export LD_LIBRARY_PATH=$CRAY_LD_LIBRARY_PATH:$LD_LIBRARY_PATH$ export MPICH_RDMA_ENABLED_CUDA=1$ aprun -n2 -N1 ./direct.out |
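This launches two MPI ranks, one per node. If the device-to-device Allgather completes correctly, rank 0 prints the success message from direct.cpp:

Success!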
GPUDirect: CUDA-aware MPI