{"id":1406,"date":"2015-05-18T15:19:33","date_gmt":"2015-05-18T20:19:33","guid":{"rendered":"http:\/\/jungwon.kim\/blog\/?p=1406"},"modified":"2015-05-18T15:19:33","modified_gmt":"2015-05-18T20:19:33","slug":"gpudirect-cuda-aware-mpi","status":"publish","type":"post","link":"https:\/\/blog.jungwon.kim\/?p=1406","title":{"rendered":"GPUDirect: CUDA aware MPI"},"content":{"rendered":"<p><a href=\"http:\/\/keeneland.gatech.edu\/software\/gpudirect\">http:\/\/keeneland.gatech.edu\/software\/gpudirect<\/a><\/p>\n<p><a href=\"https:\/\/www.olcf.ornl.gov\/tutorials\/gpudirect-mpich-enabled-cuda\/\">https:\/\/www.olcf.ornl.gov\/tutorials\/gpudirect-mpich-enabled-cuda\/<\/a><\/p>\n<h3>CUDA C<\/h3>\n<h4>direct.cpp<\/h4>\n<div>\n<div id=\"highlighter_987984\" class=\"syntaxhighlighter  cpp\">\n<table border=\"0\" cellspacing=\"0\" cellpadding=\"0\">\n<tbody>\n<tr>\n<td class=\"code\">\n<div class=\"container\">\n<div class=\"line number1 index0 alt2\"><code class=\"cpp preprocessor\">#include &lt;stdio.h&gt;<\/code><\/div>\n<div class=\"line number2 index1 alt1\"><code class=\"cpp preprocessor\">#include &lt;stdlib.h&gt;<\/code><\/div>\n<div class=\"line number3 index2 alt2\"><code class=\"cpp preprocessor\">#include &lt;cuda_runtime.h&gt;<\/code><\/div>\n<div class=\"line number4 index3 alt1\"><code class=\"cpp preprocessor\">#include &lt;mpi.h&gt;<\/code><\/div>\n<div class=\"line number5 index4 alt2\"><\/div>\n<div class=\"line number6 index5 alt1\"><code class=\"cpp color1 bold\">int<\/code> <code class=\"cpp plain\">main( <\/code><code class=\"cpp color1 bold\">int<\/code> <code class=\"cpp plain\">argc, <\/code><code class=\"cpp color1 bold\">char<\/code><code class=\"cpp plain\">** argv )<\/code><\/div>\n<div class=\"line number7 index6 alt2\"><code class=\"cpp plain\">{<\/code><\/div>\n<div class=\"line number8 index7 alt1\"><code class=\"cpp spaces\">\u00c2\u00a0\u00c2\u00a0\u00c2\u00a0\u00c2\u00a0<\/code><code class=\"cpp plain\">MPI_Init (&amp;argc, 
&amp;argv);<\/code><\/div>\n<div class=\"line number9 index8 alt2\"><\/div>\n<div class=\"line number10 index9 alt1\"><code class=\"cpp spaces\">\u00a0\u00a0\u00a0\u00a0<\/code><code class=\"cpp color1 bold\">int<\/code> <code class=\"cpp plain\">direct;<\/code><\/div>\n<div class=\"line number11 index10 alt2\"><code class=\"cpp spaces\">\u00a0\u00a0\u00a0\u00a0<\/code><code class=\"cpp color1 bold\">int<\/code> <code class=\"cpp plain\">rank, size;<\/code><\/div>\n<div class=\"line number12 index11 alt1\"><code class=\"cpp spaces\">\u00a0\u00a0\u00a0\u00a0<\/code><code class=\"cpp color1 bold\">int<\/code> <code class=\"cpp plain\">*h_buff = NULL;<\/code><\/div>\n<div class=\"line number13 index12 alt2\"><code class=\"cpp spaces\">\u00a0\u00a0\u00a0\u00a0<\/code><code class=\"cpp color1 bold\">int<\/code> <code class=\"cpp plain\">*d_rank = NULL;<\/code><\/div>\n<div class=\"line number14 index13 alt1\"><code class=\"cpp spaces\">\u00a0\u00a0\u00a0\u00a0<\/code><code class=\"cpp color1 bold\">int<\/code> <code class=\"cpp plain\">*d_buff = NULL;<\/code><\/div>\n<div class=\"line number15 index14 alt2\"><code class=\"cpp spaces\">\u00a0\u00a0\u00a0\u00a0<\/code><code class=\"cpp color1 bold\">size_t<\/code> <code class=\"cpp plain\">bytes;<\/code><\/div>\n<div class=\"line number16 index15 alt1\"><code class=\"cpp spaces\">\u00a0\u00a0\u00a0\u00a0<\/code><code class=\"cpp color1 bold\">int<\/code> <code class=\"cpp plain\">i;<\/code><\/div>\n<div class=\"line number17 index16 alt2\"><\/div>\n<div class=\"line number18 index17 alt1\"><code class=\"cpp spaces\">\u00a0\u00a0\u00a0\u00a0<\/code><code class=\"cpp comments\">\/\/ Ensure that RDMA ENABLED CUDA is set correctly<\/code><\/div>\n<div class=\"line number19 index18 alt2\"><code class=\"cpp 
spaces\">\u00a0\u00a0\u00a0\u00a0<\/code><code class=\"cpp plain\">direct = <\/code><code class=\"cpp functions bold\">getenv<\/code><code class=\"cpp plain\">(<\/code><code class=\"cpp string\">\"MPICH_RDMA_ENABLED_CUDA\"<\/code><code class=\"cpp plain\">)==NULL?0:<\/code><code class=\"cpp functions bold\">atoi<\/code><code class=\"cpp plain\">(<\/code><code class=\"cpp functions bold\">getenv<\/code> <code class=\"cpp plain\">(<\/code><code class=\"cpp string\">\"MPICH_RDMA_ENABLED_CUDA\"<\/code><code class=\"cpp plain\">));<\/code><\/div>\n<div class=\"line number20 index19 alt1\"><code class=\"cpp spaces\">\u00a0\u00a0\u00a0\u00a0<\/code><code class=\"cpp keyword bold\">if<\/code><code class=\"cpp plain\">(direct != 1){<\/code><\/div>\n<div class=\"line number21 index20 alt2\"><code class=\"cpp spaces\">\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0<\/code><code class=\"cpp functions bold\">printf<\/code> <code class=\"cpp plain\">(<\/code><code class=\"cpp string\">\"MPICH_RDMA_ENABLED_CUDA not enabled!\\n\"<\/code><code class=\"cpp plain\">);<\/code><\/div>\n<div class=\"line number22 index21 alt1\"><code class=\"cpp spaces\">\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0<\/code><code class=\"cpp functions bold\">exit<\/code> <code class=\"cpp plain\">(EXIT_FAILURE);<\/code><\/div>\n<div class=\"line number23 index22 alt2\"><code class=\"cpp spaces\">\u00a0\u00a0\u00a0\u00a0<\/code><code class=\"cpp plain\">}<\/code><\/div>\n<div class=\"line number24 index23 alt1\"><\/div>\n<div class=\"line number25 index24 alt2\"><code class=\"cpp spaces\">\u00a0\u00a0\u00a0\u00a0<\/code><code class=\"cpp comments\">\/\/ Get MPI rank and size<\/code><\/div>\n<div class=\"line number26 index25 alt1\"><code class=\"cpp 
spaces\">\u00a0\u00a0\u00a0\u00a0<\/code><code class=\"cpp plain\">MPI_Comm_rank (MPI_COMM_WORLD, &amp;rank);<\/code><\/div>\n<div class=\"line number27 index26 alt2\"><code class=\"cpp spaces\">\u00a0\u00a0\u00a0\u00a0<\/code><code class=\"cpp plain\">MPI_Comm_size (MPI_COMM_WORLD, &amp;size);<\/code><\/div>\n<div class=\"line number28 index27 alt1\"><\/div>\n<div class=\"line number29 index28 alt2\"><code class=\"cpp spaces\">\u00a0\u00a0\u00a0\u00a0<\/code><code class=\"cpp comments\">\/\/ Allocate host and device buffers and copy rank value to GPU<\/code><\/div>\n<div class=\"line number30 index29 alt1\"><code class=\"cpp spaces\">\u00a0\u00a0\u00a0\u00a0<\/code><code class=\"cpp plain\">bytes = size*<\/code><code class=\"cpp keyword bold\">sizeof<\/code><code class=\"cpp plain\">(<\/code><code class=\"cpp color1 bold\">int<\/code><code class=\"cpp plain\">);<\/code><\/div>\n<div class=\"line number31 index30 alt2\"><code class=\"cpp spaces\">\u00a0\u00a0\u00a0\u00a0<\/code><code class=\"cpp plain\">h_buff = (<\/code><code class=\"cpp color1 bold\">int<\/code><code class=\"cpp plain\">*)<\/code><code class=\"cpp functions bold\">malloc<\/code><code class=\"cpp plain\">(bytes);<\/code><\/div>\n<div class=\"line number32 index31 alt1\"><code class=\"cpp spaces\">\u00a0\u00a0\u00a0\u00a0<\/code><code class=\"cpp plain\">cudaMalloc(&amp;d_buff, bytes);<\/code><\/div>\n<div class=\"line number33 index32 alt2\"><code class=\"cpp spaces\">\u00a0\u00a0\u00a0\u00a0<\/code><code class=\"cpp plain\">cudaMalloc(&amp;d_rank, <\/code><code class=\"cpp keyword bold\">sizeof<\/code><code class=\"cpp plain\">(<\/code><code class=\"cpp color1 bold\">int<\/code><code class=\"cpp plain\">));<\/code><\/div>\n<div class=\"line number34 index33 alt1\"><code class=\"cpp 
spaces\">\u00a0\u00a0\u00a0\u00a0<\/code><code class=\"cpp plain\">cudaMemcpy(d_rank, &amp;rank, <\/code><code class=\"cpp keyword bold\">sizeof<\/code><code class=\"cpp plain\">(<\/code><code class=\"cpp color1 bold\">int<\/code><code class=\"cpp plain\">), cudaMemcpyHostToDevice);<\/code><\/div>\n<div class=\"line number35 index34 alt2\"><\/div>\n<div class=\"line number36 index35 alt1\"><code class=\"cpp spaces\">\u00a0\u00a0\u00a0\u00a0<\/code><code class=\"cpp comments\">\/\/ Perform Allgather using device buffer<\/code><\/div>\n<div class=\"line number37 index36 alt2\"><code class=\"cpp spaces\">\u00a0\u00a0\u00a0\u00a0<\/code><code class=\"cpp plain\">MPI_Allgather(d_rank, 1, MPI_INT, d_buff, 1, MPI_INT, MPI_COMM_WORLD);<\/code><\/div>\n<div class=\"line number38 index37 alt1\"><\/div>\n<div class=\"line number39 index38 alt2\"><code class=\"cpp spaces\">\u00a0\u00a0\u00a0\u00a0<\/code><code class=\"cpp comments\">\/\/ Check that the GPU buffer is correct<\/code><\/div>\n<div class=\"line number40 index39 alt1\"><code class=\"cpp spaces\">\u00a0\u00a0\u00a0\u00a0<\/code><code class=\"cpp plain\">cudaMemcpy(h_buff, d_buff, bytes, cudaMemcpyDeviceToHost);<\/code><\/div>\n<div class=\"line number41 index40 alt2\"><code class=\"cpp spaces\">\u00a0\u00a0\u00a0\u00a0<\/code><code class=\"cpp keyword bold\">for<\/code><code class=\"cpp plain\">(i=0; i&lt;size; i++){<\/code><\/div>\n<div class=\"line number42 index41 alt1\"><code class=\"cpp spaces\">\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0<\/code><code class=\"cpp keyword bold\">if<\/code><code class=\"cpp plain\">(h_buff[i] != i) {<\/code><\/div>\n<div class=\"line number43 index42 alt2\"><code class=\"cpp 
spaces\">\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0<\/code><code class=\"cpp functions bold\">printf<\/code> <code class=\"cpp plain\">(<\/code><code class=\"cpp string\">\"Alltoall Failed!\\n\"<\/code><code class=\"cpp plain\">);<\/code><\/div>\n<div class=\"line number44 index43 alt1\"><code class=\"cpp spaces\">\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0<\/code><code class=\"cpp functions bold\">exit<\/code> <code class=\"cpp plain\">(EXIT_FAILURE);<\/code><\/div>\n<div class=\"line number45 index44 alt2\"><code class=\"cpp spaces\">\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0<\/code><code class=\"cpp plain\">}<\/code><\/div>\n<div class=\"line number46 index45 alt1\"><code class=\"cpp spaces\">\u00a0\u00a0\u00a0\u00a0<\/code><code class=\"cpp plain\">}<\/code><\/div>\n<div class=\"line number47 index46 alt2\"><code class=\"cpp spaces\">\u00a0\u00a0\u00a0\u00a0<\/code><code class=\"cpp keyword bold\">if<\/code><code class=\"cpp plain\">(rank==0)<\/code><\/div>\n<div class=\"line number48 index47 alt1\"><code class=\"cpp spaces\">\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0<\/code><code class=\"cpp functions bold\">printf<\/code><code class=\"cpp plain\">(<\/code><code class=\"cpp string\">\"Success!\\n\"<\/code><code class=\"cpp plain\">);<\/code><\/div>\n<div class=\"line number49 index48 alt2\"><\/div>\n<div class=\"line number50 index49 alt1\"><code class=\"cpp spaces\">\u00a0\u00a0\u00a0\u00a0<\/code><code class=\"cpp comments\">\/\/ Clean up<\/code><\/div>\n<div class=\"line number51 index50 alt2\"><code class=\"cpp spaces\">\u00a0\u00a0\u00a0\u00a0<\/code><code 
class=\"cpp functions bold\">free<\/code><code class=\"cpp plain\">(h_buff);<\/code><\/div>\n<div class=\"line number52 index51 alt1\"><code class=\"cpp spaces\">\u00a0\u00a0\u00a0\u00a0<\/code><code class=\"cpp plain\">cudaFree(d_buff);<\/code><\/div>\n<div class=\"line number53 index52 alt2\"><code class=\"cpp spaces\">\u00a0\u00a0\u00a0\u00a0<\/code><code class=\"cpp plain\">cudaFree(d_rank);<\/code><\/div>\n<div class=\"line number54 index53 alt1\"><code class=\"cpp spaces\">\u00a0\u00a0\u00a0\u00a0<\/code><code class=\"cpp plain\">MPI_Finalize();<\/code><\/div>\n<div class=\"line number55 index54 alt2\"><\/div>\n<div class=\"line number56 index55 alt1\"><code class=\"cpp spaces\">\u00a0\u00a0\u00a0\u00a0<\/code><code class=\"cpp keyword bold\">return<\/code> <code class=\"cpp plain\">0;<\/code><\/div>\n<div class=\"line number57 index56 alt2\"><code class=\"cpp plain\">}<\/code><\/div>\n<\/div>\n<\/td>\n<\/tr>\n<\/tbody>\n<\/table>\n<\/div>\n<\/div>\n<h4>Compiling<\/h4>\n<p>For ease of compiling, the GNU environment will be used.<\/p>\n<div>\n<div id=\"highlighter_436832\" class=\"syntaxhighlighter nogutter  plain\">\n<table border=\"0\" cellspacing=\"0\" cellpadding=\"0\">\n<tbody>\n<tr>\n<td class=\"code\">\n<div class=\"container\">\n<div class=\"line number1 index0 alt2\"><code class=\"plain plain\">$ module load cudatoolkit<\/code><\/div>\n<div class=\"line number2 index1 alt1\"><code class=\"plain plain\">$ module switch PrgEnv-pgi PrgEnv-gnu<\/code><\/div>\n<div class=\"line number3 index2 alt2\"><code class=\"plain plain\">$ CC -lcudart direct.cpp -o direct.out<\/code><\/div>\n<\/div>\n<\/td>\n<\/tr>\n<\/tbody>\n<\/table>\n<\/div>\n<\/div>\n<h4>Running<\/h4>\n<div>\n<div id=\"highlighter_291708\" class=\"syntaxhighlighter nogutter  plain\">\n<table border=\"0\" cellspacing=\"0\" cellpadding=\"0\">\n<tbody>\n<tr>\n<td class=\"code\">\n<div 
class=\"container\">\n<div class=\"line number1 index0 alt2\"><code class=\"plain plain\">$ export LD_LIBRARY_PATH=$CRAY_LD_LIBRARY_PATH:$LD_LIBRARY_PATH<\/code><\/div>\n<div class=\"line number2 index1 alt1\"><code class=\"plain plain\">$ export MPICH_RDMA_ENABLED_CUDA=1<\/code><\/div>\n<div class=\"line number3 index2 alt2\"><code class=\"plain plain\">$ aprun -n2 -N1 .\/direct.out<\/code><\/div>\n<\/div>\n<\/td>\n<\/tr>\n<\/tbody>\n<\/table>\n<\/div>\n<\/div>\n","protected":false},"excerpt":{"rendered":"<p>http:\/\/keeneland.gatech.edu\/software\/gpudirect https:\/\/www.olcf.ornl.gov\/tutorials\/gpudirect-mpich-enabled-cuda\/ CUDA C direct.cpp #include &lt;stdio.h&gt; #include &lt;stdlib.h&gt; #include &lt;cuda_runtime.h&gt; #include &lt;mpi.h&gt; int main( int argc, char** argv ) { \u00c2\u00a0\u00c2\u00a0\u00c2\u00a0\u00c2\u00a0MPI_Init (&amp;argc, &amp;argv); \u00c2\u00a0\u00c2\u00a0\u00c2\u00a0\u00c2\u00a0int direct; \u00c2\u00a0\u00c2\u00a0\u00c2\u00a0\u00c2\u00a0int rank, size; \u00c2\u00a0\u00c2\u00a0\u00c2\u00a0\u00c2\u00a0int *h_buff = NULL; \u00c2\u00a0\u00c2\u00a0\u00c2\u00a0\u00c2\u00a0int *d_rank = NULL; \u00c2\u00a0\u00c2\u00a0\u00c2\u00a0\u00c2\u00a0int *d_buff 
=<\/p>\n","protected":false},"author":1,"featured_media":0,"comment_status":"closed","ping_status":"closed","sticky":false,"template":"","format":"standard","meta":{"footnotes":""},"categories":[4],"tags":[],"class_list":["post-1406","post","type-post","status-publish","format-standard","hentry","category-cs"],"_links":{"self":[{"href":"https:\/\/blog.jungwon.kim\/index.php?rest_route=\/wp\/v2\/posts\/1406","targetHints":{"allow":["GET"]}}],"collection":[{"href":"https:\/\/blog.jungwon.kim\/index.php?rest_route=\/wp\/v2\/posts"}],"about":[{"href":"https:\/\/blog.jungwon.kim\/index.php?rest_route=\/wp\/v2\/types\/post"}],"author":[{"embeddable":true,"href":"https:\/\/blog.jungwon.kim\/index.php?rest_route=\/wp\/v2\/users\/1"}],"replies":[{"embeddable":true,"href":"https:\/\/blog.jungwon.kim\/index.php?rest_route=%2Fwp%2Fv2%2Fcomments&post=1406"}],"version-history":[{"count":0,"href":"https:\/\/blog.jungwon.kim\/index.php?rest_route=\/wp\/v2\/posts\/1406\/revisions"}],"wp:attachment":[{"href":"https:\/\/blog.jungwon.kim\/index.php?rest_route=%2Fwp%2Fv2%2Fmedia&parent=1406"}],"wp:term":[{"taxonomy":"category","embeddable":true,"href":"https:\/\/blog.jungwon.kim\/index.php?rest_route=%2Fwp%2Fv2%2Fcategories&post=1406"},{"taxonomy":"post_tag","embeddable":true,"href":"https:\/\/blog.jungwon.kim\/index.php?rest_route=%2Fwp%2Fv2%2Ftags&post=1406"}],"curies":[{"name":"wp","href":"https:\/\/api.w.org\/{rel}","templated":true}]}}