diff --git a/config/opal_mca.m4 b/config/opal_mca.m4 index bb51d3bc5f1..7c597166289 100644 --- a/config/opal_mca.m4 +++ b/config/opal_mca.m4 @@ -186,7 +186,7 @@ of type-component pairs. For example, --enable-mca-no-build=pml-ob1]) else msg= if test -z "$enable_mca_dso"; then - enable_mca_dso="accelerator-cuda,accelerator-rocm,accelerator-ze" + enable_mca_dso="accelerator-cuda,accelerator-rocm,accelerator-ze,op-cuda,op-rocm" msg="(default)" fi DSO_all=0 diff --git a/ompi/mca/coll/acoll/coll_acoll_allreduce.c b/ompi/mca/coll/acoll/coll_acoll_allreduce.c index 3b40fef39f9..6da452ec719 100644 --- a/ompi/mca/coll/acoll/coll_acoll_allreduce.c +++ b/ompi/mca/coll/acoll/coll_acoll_allreduce.c @@ -483,7 +483,7 @@ int mca_coll_acoll_allreduce_intra(const void *sbuf, void *rbuf, size_t count, /* Falling back to recursivedoubling for non-commutative operators to be safe */ if (!ompi_op_is_commute(op)) { return ompi_coll_base_allreduce_intra_recursivedoubling(sbuf, rbuf, count, dtype, op, comm, - module); + module, NULL); } /* Obtain the subcomms structure */ @@ -497,7 +497,7 @@ int mca_coll_acoll_allreduce_intra(const void *sbuf, void *rbuf, size_t count, /* Fallback to knomial if subc is not obtained */ if (NULL == subc) { return ompi_coll_base_allreduce_intra_redscat_allgather(sbuf, rbuf, count, dtype, op, comm, - module); + module, NULL); } if (!subc->initialized) { err = mca_coll_acoll_comm_split_init(comm, acoll_module, subc, 0); @@ -513,7 +513,7 @@ int mca_coll_acoll_allreduce_intra(const void *sbuf, void *rbuf, size_t count, if (num_nodes > 1) { if (total_dsize > 16384) { return ompi_coll_base_allreduce_intra_redscat_allgather(sbuf, rbuf, count, dtype, op, - comm, module); + comm, module, NULL); } int use_socket = acoll_module->use_socket != -1 ? acoll_module->use_socket : 0; coll_acoll_subcomms_t *soc_subc = NULL; @@ -525,7 +525,7 @@ int mca_coll_acoll_allreduce_intra(const void *sbuf, void *rbuf, size_t count, /* Validate communicator hierarchy before proceeding */ if (NULL == soc_comm || NULL == ldr_comm) { return ompi_coll_base_allreduce_intra_redscat_allgather(sbuf, rbuf, count, dtype, op, - comm, module); + comm, module, NULL); } err = check_and_create_subc(soc_comm, acoll_module, &soc_subc); @@ -573,10 +573,10 @@ int mca_coll_acoll_allreduce_intra(const void *sbuf, void *rbuf, size_t count, if (ompi_comm_size(ldr_comm) > 1 && -1 != ldr_root) { if ((MPI_IN_PLACE == sbuf)) { err = ompi_coll_base_allreduce_intra_recursivedoubling(MPI_IN_PLACE, rbuf, count, dtype, op, - ldr_comm, module); + ldr_comm, module, NULL); } else { err = ompi_coll_base_allreduce_intra_recursivedoubling(tmp_sbuf, rbuf, count, dtype, op, - ldr_comm, module); + ldr_comm, module, NULL); } if (MPI_SUCCESS != err) { if (NULL != inplacebuf_free) { @@ -607,23 +607,23 @@ int mca_coll_acoll_allreduce_intra(const void *sbuf, void *rbuf, size_t count, if (1 == num_nodes) { if (total_dsize < 32) { return ompi_coll_base_allreduce_intra_recursivedoubling(sbuf, rbuf, count, dtype, op, - comm, module); + comm, module, NULL); } else if ((total_dsize < 512) && is_opt) { return mca_coll_acoll_allreduce_small_msgs_h(sbuf, rbuf, count, dtype, op, comm, module, subc, 1); } else if (total_dsize <= 2048) { return ompi_coll_base_allreduce_intra_recursivedoubling(sbuf, rbuf, count, dtype, op, - comm, module); + comm, module, NULL); } else if (total_dsize < 65536) { if (1 == alg) { return ompi_coll_base_allreduce_intra_recursivedoubling(sbuf, rbuf, count, dtype, - op, comm, module); + op, comm, module, NULL); } else if (2 == alg) { return ompi_coll_base_allreduce_intra_redscat_allgather(sbuf, rbuf, count, dtype, - op, comm, module); + op, comm, module, NULL); } else { /*3 == alg */ return ompi_coll_base_allreduce_intra_ring_segmented(sbuf, rbuf, count, dtype, op, - comm, module, 0); + comm, module, 0, NULL); } } else if (total_dsize < 4194304) { if (((0 != subc->smsc_use_sr_buf) || (subc->smsc_buf_size > 2 * total_dsize)) @@ -631,7 +631,7 @@ int mca_coll_acoll_allreduce_intra(const void *sbuf, void *rbuf, size_t count, return mca_coll_acoll_allreduce_smsc_f(sbuf, rbuf, count, dtype, op, comm, module, subc); } else { return ompi_coll_base_allreduce_intra_redscat_allgather(sbuf, rbuf, count, dtype, - op, comm, module); + op, comm, module, NULL); } } else if (total_dsize <= 16777216) { if (((0 != subc->smsc_use_sr_buf) || (subc->smsc_buf_size > 2 * total_dsize)) @@ -640,7 +640,7 @@ int mca_coll_acoll_allreduce_intra(const void *sbuf, void *rbuf, size_t count, return mca_coll_acoll_bcast(rbuf, count, dtype, 0, comm, module); } else { return ompi_coll_base_allreduce_intra_redscat_allgather(sbuf, rbuf, count, dtype, - op, comm, module); + op, comm, module, NULL); } } else { if (((0 != subc->smsc_use_sr_buf) || (subc->smsc_buf_size > 2 * total_dsize)) @@ -648,13 +648,13 @@ int mca_coll_acoll_allreduce_intra(const void *sbuf, void *rbuf, size_t count, return mca_coll_acoll_allreduce_smsc_f(sbuf, rbuf, count, dtype, op, comm, module, subc); } else { return ompi_coll_base_allreduce_intra_redscat_allgather(sbuf, rbuf, count, dtype, - op, comm, module); + op, comm, module, NULL); } } } else { return ompi_coll_base_allreduce_intra_redscat_allgather(sbuf, rbuf, count, dtype, op, comm, - module); + module, NULL); } return MPI_SUCCESS; } diff --git a/ompi/mca/coll/acoll/coll_acoll_reduce.c b/ompi/mca/coll/acoll/coll_acoll_reduce.c index 28fc3c62c6a..8fee8f7c2a2 100644 --- a/ompi/mca/coll/acoll/coll_acoll_reduce.c +++ b/ompi/mca/coll/acoll/coll_acoll_reduce.c @@ -360,11 +360,11 @@ int mca_coll_acoll_reduce_intra(const void *sbuf, void *rbuf, size_t count, /* Falling back to inorder binary for non-commutative operators to be safe */ if (!ompi_op_is_commute(op)) { return ompi_coll_base_reduce_intra_in_order_binary(sbuf, rbuf, count, dtype, op, root, comm, - module, 0, 0); + module, 0, 0, NULL); } if (0 != root) { // ToDo: support non-zero root return ompi_coll_base_reduce_intra_binomial(sbuf, rbuf, count, dtype, op, root, comm, - module, 0, 0); + module, 0, 0, NULL); } /* Disable shm/xpmem based optimizations if: */ @@ -396,7 +396,7 @@ int mca_coll_acoll_reduce_intra(const void *sbuf, void *rbuf, size_t count, /* Fallback to knomial if subc is not obtained */ if (NULL == subc) { return ompi_coll_base_reduce_intra_binomial(sbuf, rbuf, count, dtype, op, root, comm, - module, 0, 0); + module, 0, 0, NULL); } if (!subc->initialized || (root != subc->prev_init_root)) { @@ -422,10 +422,10 @@ int mca_coll_acoll_reduce_intra(const void *sbuf, void *rbuf, size_t count, comm, module); } else if (2 == alg) { return ompi_coll_base_reduce_intra_binomial(sbuf, rbuf, count, dtype, op, root, - comm, module, 0, 0); + comm, module, 0, 0, NULL); } else { /* either 3 == alg or acoll_module->red_algo is not 0, 1, 2*/ return ompi_coll_base_reduce_intra_in_order_binary(sbuf, rbuf, count, dtype, op, - root, comm, module, 0, 0); + root, comm, module, 0, 0, NULL); } } else { if ((((0 != subc->smsc_use_sr_buf) @@ -437,7 +437,7 @@ int mca_coll_acoll_reduce_intra(const void *sbuf, void *rbuf, size_t count, module, subc); } else { return ompi_coll_base_reduce_intra_binomial(sbuf, rbuf, count, dtype, op, - root, comm, module, 0, 0); + root, comm, module, 0, 0, NULL); } } } else { @@ -446,7 +446,7 @@ int mca_coll_acoll_reduce_intra(const void *sbuf, void *rbuf, size_t count, subc); } else { return ompi_coll_base_reduce_intra_binomial(sbuf, rbuf, count, dtype, op, root, comm, - module, 0, 0); + module, 0, 0, NULL); } } return MPI_SUCCESS; diff --git a/ompi/mca/coll/acoll/coll_acoll_utils.h b/ompi/mca/coll/acoll/coll_acoll_utils.h index 41d02381b5f..1fc9af3a931 100644 --- a/ompi/mca/coll/acoll/coll_acoll_utils.h +++ b/ompi/mca/coll/acoll/coll_acoll_utils.h @@ -33,6 +33,19 @@ extern int mca_coll_acoll_without_smsc; extern int mca_coll_acoll_smsc_use_sr_buf; extern int mca_coll_acoll_barrier_algo; +/* Wrapper so recursivedoubling can be stored as a module function pointer + * despite having gained an allocator parameter in coll_base_functions.h. */ +static int +ompi_coll_acoll_allreduce_intra_recursivedoubling(const void *sbuf, void *rbuf, size_t count, + struct ompi_datatype_t *dtype, + struct ompi_op_t *op, + struct ompi_communicator_t *comm, + mca_coll_base_module_t *module) +{ + return ompi_coll_base_allreduce_intra_recursivedoubling(sbuf, rbuf, count, dtype, op, + comm, module, NULL); +} + /* * Hybrid backoff spin-wait with adaptive progress calls. * Optimized for intra-node shared memory synchronization. @@ -440,7 +453,7 @@ static inline int mca_coll_acoll_comm_split_init(ompi_communicator_t *comm, int rank = ompi_comm_rank(comm); (comm)->c_coll->coll_allgather = ompi_coll_base_allgather_intra_ring; - (comm)->c_coll->coll_allreduce = ompi_coll_base_allreduce_intra_recursivedoubling; + (comm)->c_coll->coll_allreduce = ompi_coll_acoll_allreduce_intra_recursivedoubling; (comm)->c_coll->coll_bcast = ompi_coll_base_bcast_intra_basic_linear; if (!subc->initialized) { OBJ_CONSTRUCT(&comm_info, opal_info_t); @@ -538,14 +551,14 @@ static inline int mca_coll_acoll_comm_split_init(ompi_communicator_t *comm, coll_bcast_loc = (subc->local_comm)->c_coll->coll_bcast; (subc->local_comm)->c_coll->coll_allgather = ompi_coll_base_allgather_intra_ring; (subc->local_comm)->c_coll->coll_allreduce - = ompi_coll_base_allreduce_intra_recursivedoubling; + = ompi_coll_acoll_allreduce_intra_recursivedoubling; (subc->local_comm)->c_coll->coll_bcast = ompi_coll_base_bcast_intra_basic_linear; coll_allreduce_soc = (subc->socket_comm)->c_coll->coll_allreduce; coll_allgather_soc = (subc->socket_comm)->c_coll->coll_allgather; coll_bcast_soc = (subc->socket_comm)->c_coll->coll_bcast; (subc->socket_comm)->c_coll->coll_allgather = ompi_coll_base_allgather_intra_ring; (subc->socket_comm)->c_coll->coll_allreduce - = ompi_coll_base_allreduce_intra_recursivedoubling; + = ompi_coll_acoll_allreduce_intra_recursivedoubling; (subc->socket_comm)->c_coll->coll_bcast = ompi_coll_base_bcast_intra_basic_linear; } diff --git a/ompi/mca/coll/base/coll_base_allgather.c b/ompi/mca/coll/base/coll_base_allgather.c index 6d9bd6fcfc3..a9480f09169 100644 --- a/ompi/mca/coll/base/coll_base_allgather.c +++ b/ompi/mca/coll/base/coll_base_allgather.c @@ -114,7 +114,7 @@ ompi_coll_base_allgather_intra_recursivedoubling(const void *sbuf, size_t scount int k = 2; return ompi_coll_base_allgather_intra_k_bruck(sbuf, scount, sdtype, rbuf, rcount, rdtype, - comm, module, k); + comm, module, k, NULL); } OPAL_OUTPUT((ompi_coll_base_framework.framework_output, @@ -771,7 +771,8 @@ int ompi_coll_base_allgather_intra_k_bruck(const void *sbuf, size_t scount, struct ompi_datatype_t *rdtype, struct ompi_communicator_t *comm, mca_coll_base_module_t *module, - int radix) + int radix, + mca_allocator_base_module_t *allocator) { int line = -1, rank, size, dst, src, err = MPI_SUCCESS; int recvcount, distance; @@ -796,7 +797,7 @@ int ompi_coll_base_allgather_intra_k_bruck(const void *sbuf, size_t scount, if (0 != rank) { /* Compute the temporary buffer size, including datatypes empty gaps */ rsize = opal_datatype_span(&rdtype->super, (size_t)rcount * (size - rank), &rgap); - tmp_buf = (char *) malloc(rsize); + tmp_buf = (char *) COLL_BASE_ALLOC(allocator, rsize); tmp_buf_start = tmp_buf - rgap; } @@ -891,7 +892,7 @@ int ompi_coll_base_allgather_intra_k_bruck(const void *sbuf, size_t scount, if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; } } - if(tmp_buf != NULL) free(tmp_buf); + if(tmp_buf != NULL) COLL_BASE_FREE(allocator, tmp_buf); return MPI_SUCCESS; err_hndl: @@ -911,7 +912,7 @@ int ompi_coll_base_allgather_intra_k_bruck(const void *sbuf, size_t scount, OPAL_OUTPUT((ompi_coll_base_framework.framework_output, "%s:%4d\tError occurred %d, rank %2d", __FILE__, line, err, rank)); if(tmp_buf != NULL) { - free(tmp_buf); + COLL_BASE_FREE(allocator, tmp_buf); tmp_buf = NULL; tmp_buf_start = NULL; } diff --git a/ompi/mca/coll/base/coll_base_allreduce.c b/ompi/mca/coll/base/coll_base_allreduce.c index 82d57328b2e..1d605258102 100644 --- a/ompi/mca/coll/base/coll_base_allreduce.c +++ b/ompi/mca/coll/base/coll_base_allreduce.c @@ -40,6 +40,7 @@ #include "ompi/mca/pml/pml.h" #include "ompi/op/op.h" #include "ompi/mca/coll/base/coll_base_functions.h" +#include "ompi/op/op_gpu_session.h" #include "coll_base_topo.h" #include "coll_base_util.h" @@ -136,7 +137,8 @@ ompi_coll_base_allreduce_intra_recursivedoubling(const void *sbuf, void *rbuf, struct ompi_datatype_t *dtype, struct ompi_op_t *op, struct ompi_communicator_t *comm, - mca_coll_base_module_t *module) + mca_coll_base_module_t *module, + ompi_op_gpu_session_t *session) { int ret, line, rank, size, adjsize, remote, distance; int newrank, newremote, extra_ranks; @@ -160,7 +162,7 @@ ompi_coll_base_allreduce_intra_recursivedoubling(const void *sbuf, void *rbuf, /* Allocate and initialize temporary send buffer */ span = opal_datatype_span(&dtype->super, count, &gap); - inplacebuf_free = (char*) malloc(span); + inplacebuf_free = (char*) COLL_SESSION_ALLOC(session, span); if (NULL == inplacebuf_free) { ret = -1; line = __LINE__; goto error_hndl; } inplacebuf = inplacebuf_free - gap; @@ -200,7 +202,7 @@ ompi_coll_base_allreduce_intra_recursivedoubling(const void *sbuf, void *rbuf, MPI_STATUS_IGNORE)); if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; } /* tmpsend = tmprecv (op) tmpsend */ - ompi_op_reduce(op, tmprecv, tmpsend, count, dtype); + COLL_BASE_REDUCE(session, op, tmprecv, tmpsend, count, dtype); newrank = rank >> 1; } } else { @@ -230,13 +232,13 @@ ompi_coll_base_allreduce_intra_recursivedoubling(const void *sbuf, void *rbuf, /* Apply operation */ if (rank < remote) { /* tmprecv = tmpsend (op) tmprecv */ - ompi_op_reduce(op, tmpsend, tmprecv, count, dtype); + COLL_BASE_REDUCE(session, op, tmpsend, tmprecv, count, dtype); tmpswap = tmprecv; tmprecv = tmpsend; tmpsend = tmpswap; } else { /* tmpsend = tmprecv (op) tmpsend */ - ompi_op_reduce(op, tmprecv, tmpsend, count, dtype); + COLL_BASE_REDUCE(session, op, tmprecv, tmpsend, count, dtype); } } @@ -266,14 +268,14 @@ ompi_coll_base_allreduce_intra_recursivedoubling(const void *sbuf, void *rbuf, if (ret < 0) { line = __LINE__; goto error_hndl; } } - if (NULL != inplacebuf_free) free(inplacebuf_free); + COLL_SESSION_FREE(session, inplacebuf_free); return MPI_SUCCESS; error_hndl: OPAL_OUTPUT((ompi_coll_base_framework.framework_output, "%s:%4d\tRank %d Error occurred %d\n", __FILE__, line, rank, ret)); (void)line; // silence compiler warning - if (NULL != inplacebuf_free) free(inplacebuf_free); + COLL_SESSION_FREE(session, inplacebuf_free); return ret; } @@ -346,7 +348,8 @@ ompi_coll_base_allreduce_intra_ring(const void *sbuf, void *rbuf, size_t count, struct ompi_datatype_t *dtype, struct ompi_op_t *op, struct ompi_communicator_t *comm, - mca_coll_base_module_t *module) + mca_coll_base_module_t *module, + ompi_op_gpu_session_t *session) { int ret, line, rank, size, k, recv_from, send_to, block_count, inbi; int early_segcount, late_segcount, split_rank, max_segcount; @@ -377,7 +380,8 @@ ompi_coll_base_allreduce_intra_ring(const void *sbuf, void *rbuf, size_t count, return (ompi_coll_base_allreduce_intra_recursivedoubling(sbuf, rbuf, count, dtype, op, - comm, module)); + comm, module, + session)); } /* Allocate and initialize temporary buffers */ @@ -401,10 +405,10 @@ ompi_coll_base_allreduce_intra_ring(const void *sbuf, void *rbuf, size_t count, max_real_segsize = true_extent + (max_segcount - 1) * extent; - inbuf[0] = (char*)malloc(max_real_segsize); + inbuf[0] = (char*)COLL_SESSION_ALLOC(session, max_real_segsize); if (NULL == inbuf[0]) { ret = -1; line = __LINE__; goto error_hndl; } if (size > 2) { - inbuf[1] = (char*)malloc(max_real_segsize); + inbuf[1] = (char*)COLL_SESSION_ALLOC(session, max_real_segsize); if (NULL == inbuf[1]) { ret = -1; line = __LINE__; goto error_hndl; } } @@ -472,7 +476,7 @@ ompi_coll_base_allreduce_intra_ring(const void *sbuf, void *rbuf, size_t count, ((ptrdiff_t)prevblock * late_segcount + split_rank)); block_count = ((prevblock < split_rank)? early_segcount : late_segcount); tmprecv = ((char*)rbuf) + (ptrdiff_t)block_offset * extent; - ompi_op_reduce(op, inbuf[inbi ^ 0x1], tmprecv, block_count, dtype); + COLL_BASE_REDUCE(session, op, inbuf[inbi ^ 0x1], tmprecv, block_count, dtype); /* send previous block to send_to */ ret = MCA_PML_CALL(send(tmprecv, block_count, dtype, send_to, @@ -493,7 +497,7 @@ ompi_coll_base_allreduce_intra_ring(const void *sbuf, void *rbuf, size_t count, ((ptrdiff_t)recv_from * late_segcount + split_rank)); block_count = ((recv_from < split_rank)? early_segcount : late_segcount); tmprecv = ((char*)rbuf) + (ptrdiff_t)block_offset * extent; - ompi_op_reduce(op, inbuf[inbi], tmprecv, block_count, dtype); + COLL_BASE_REDUCE(session, op, inbuf[inbi], tmprecv, block_count, dtype); /* Distribution loop - variation of ring allgather */ send_to = (rank + 1) % size; @@ -524,8 +528,8 @@ ompi_coll_base_allreduce_intra_ring(const void *sbuf, void *rbuf, size_t count, } - if (NULL != inbuf[0]) free(inbuf[0]); - if (NULL != inbuf[1]) free(inbuf[1]); + COLL_SESSION_FREE(session, inbuf[0]); + COLL_SESSION_FREE(session, inbuf[1]); return MPI_SUCCESS; @@ -534,8 +538,8 @@ ompi_coll_base_allreduce_intra_ring(const void *sbuf, void *rbuf, size_t count, __FILE__, line, rank, ret)); ompi_coll_base_free_reqs(reqs, 2); (void)line; // silence compiler warning - if (NULL != inbuf[0]) free(inbuf[0]); - if (NULL != inbuf[1]) free(inbuf[1]); + COLL_SESSION_FREE(session, inbuf[0]); + COLL_SESSION_FREE(session, inbuf[1]); return ret; } @@ -624,7 +628,8 @@ ompi_coll_base_allreduce_intra_ring_segmented(const void *sbuf, void *rbuf, size struct ompi_op_t *op, struct ompi_communicator_t *comm, mca_coll_base_module_t *module, - uint32_t segsize) + uint32_t segsize, + ompi_op_gpu_session_t *session) { int ret, line, rank, size, k, recv_from, send_to; int early_blockcount, late_blockcount, split_rank; @@ -660,7 +665,7 @@ ompi_coll_base_allreduce_intra_ring_segmented(const void *sbuf, void *rbuf, size if (count < (size_t) (size * segcount)) { OPAL_OUTPUT((ompi_coll_base_framework.framework_output, "coll:base:allreduce_ring_segmented rank %d/%d, count %zu, switching to regular ring", rank, size, count)); return (ompi_coll_base_allreduce_intra_ring(sbuf, rbuf, count, dtype, op, - comm, module)); + comm, module, session)); } /* Determine the number of phases of the algorithm */ @@ -689,10 +694,10 @@ ompi_coll_base_allreduce_intra_ring_segmented(const void *sbuf, void *rbuf, size max_real_segsize = opal_datatype_span(&dtype->super, max_segcount, &gap); /* Allocate and initialize temporary buffers */ - inbuf[0] = (char*)malloc(max_real_segsize); + inbuf[0] = (char*)COLL_SESSION_ALLOC(session, max_real_segsize); if (NULL == inbuf[0]) { ret = -1; line = __LINE__; goto error_hndl; } if (size > 2) { - inbuf[1] = (char*)malloc(max_real_segsize); + inbuf[1] = (char*)COLL_SESSION_ALLOC(session, max_real_segsize); if (NULL == inbuf[1]) { ret = -1; line = __LINE__; goto error_hndl; } } @@ -783,7 +788,7 @@ ompi_coll_base_allreduce_intra_ring_segmented(const void *sbuf, void *rbuf, size ((ptrdiff_t)phase * (ptrdiff_t)early_phase_segcount) : ((ptrdiff_t)phase * (ptrdiff_t)late_phase_segcount + split_phase)); tmprecv = ((char*)rbuf) + (ptrdiff_t)(block_offset + phase_offset) * extent; - ompi_op_reduce(op, inbuf[inbi ^ 0x1], tmprecv, phase_count, dtype); + COLL_BASE_REDUCE(session, op, inbuf[inbi ^ 0x1], tmprecv, phase_count, dtype); /* send previous block to send_to */ ret = MCA_PML_CALL(send(tmprecv, phase_count, dtype, send_to, @@ -812,7 +817,7 @@ ompi_coll_base_allreduce_intra_ring_segmented(const void *sbuf, void *rbuf, size ((ptrdiff_t)phase * (ptrdiff_t)early_phase_segcount) : ((ptrdiff_t)phase * (ptrdiff_t)late_phase_segcount + split_phase)); tmprecv = ((char*)rbuf) + (ptrdiff_t)(block_offset + phase_offset) * extent; - ompi_op_reduce(op, inbuf[inbi], tmprecv, phase_count, dtype); + COLL_BASE_REDUCE(session, op, inbuf[inbi], tmprecv, phase_count, dtype); } /* Distribution loop - variation of ring allgather */ @@ -844,8 +849,8 @@ ompi_coll_base_allreduce_intra_ring_segmented(const void *sbuf, void *rbuf, size } - if (NULL != inbuf[0]) free(inbuf[0]); - if (NULL != inbuf[1]) free(inbuf[1]); + COLL_SESSION_FREE(session, inbuf[0]); + COLL_SESSION_FREE(session, inbuf[1]); return MPI_SUCCESS; @@ -854,8 +859,8 @@ ompi_coll_base_allreduce_intra_ring_segmented(const void *sbuf, void *rbuf, size __FILE__, line, rank, ret)); ompi_coll_base_free_reqs(reqs, 2); (void)line; // silence compiler warning - if (NULL != inbuf[0]) free(inbuf[0]); - if (NULL != inbuf[1]) free(inbuf[1]); + COLL_SESSION_FREE(session, inbuf[0]); + COLL_SESSION_FREE(session, inbuf[1]); return ret; } @@ -974,7 +979,7 @@ ompi_coll_base_allreduce_intra_basic_linear(const void *sbuf, void *rbuf, size_t int ompi_coll_base_allreduce_intra_redscat_allgather( const void *sbuf, void *rbuf, size_t count, struct ompi_datatype_t *dtype, struct ompi_op_t *op, struct ompi_communicator_t *comm, - mca_coll_base_module_t *module) + mca_coll_base_module_t *module, ompi_op_gpu_session_t *session) { int *rindex = NULL, *rcount = NULL, *sindex = NULL, *scount = NULL; @@ -1006,7 +1011,7 @@ int ompi_coll_base_allreduce_intra_redscat_allgather( /* Temporary buffer for receiving messages */ char *tmp_buf = NULL; - char *tmp_buf_raw = (char *)malloc(dsize); + char *tmp_buf_raw = (char *)COLL_SESSION_ALLOC(session, dsize); if (NULL == tmp_buf_raw) return OMPI_ERR_OUT_OF_RESOURCE; tmp_buf = tmp_buf_raw - gap; @@ -1056,8 +1061,8 @@ int ompi_coll_base_allreduce_intra_redscat_allgather( if (MPI_SUCCESS != err) { goto cleanup_and_return; } /* Reduce on the right half of the buffers (result in rbuf) */ - ompi_op_reduce(op, (char *)tmp_buf + (ptrdiff_t)count_lhalf * extent, - (char *)rbuf + count_lhalf * extent, count_rhalf, dtype); + COLL_BASE_REDUCE(session, op, (char *)tmp_buf + (ptrdiff_t)count_lhalf * extent, + (char *)rbuf + count_lhalf * extent, count_rhalf, dtype); /* Send the right half to the left neighbor */ err = MCA_PML_CALL(send((char *)rbuf + (ptrdiff_t)count_lhalf * extent, @@ -1084,7 +1089,7 @@ int ompi_coll_base_allreduce_intra_redscat_allgather( if (MPI_SUCCESS != err) { goto cleanup_and_return; } /* Reduce on the right half of the buffers (result in rbuf) */ - ompi_op_reduce(op, tmp_buf, rbuf, count_lhalf, dtype); + COLL_BASE_REDUCE(session, op, tmp_buf, rbuf, count_lhalf, dtype); /* Recv the right half from the right neighbor */ err = MCA_PML_CALL(recv((char *)rbuf + (ptrdiff_t)count_lhalf * extent, @@ -1165,9 +1170,9 @@ int ompi_coll_base_allreduce_intra_redscat_allgather( if (MPI_SUCCESS != err) { goto cleanup_and_return; } /* Local reduce: rbuf[] = tmp_buf[] rbuf[] */ - ompi_op_reduce(op, (char *)tmp_buf + (ptrdiff_t)rindex[step] * extent, - (char *)rbuf + (ptrdiff_t)rindex[step] * extent, - rcount[step], dtype); + COLL_BASE_REDUCE(session, op, (char *)tmp_buf + (ptrdiff_t)rindex[step] * extent, + (char *)rbuf + (ptrdiff_t)rindex[step] * extent, + rcount[step], dtype); /* Move the current window to the received message */ if (step + 1 < nsteps) { @@ -1234,8 +1239,7 @@ int ompi_coll_base_allreduce_intra_redscat_allgather( } cleanup_and_return: - if (NULL != tmp_buf_raw) - free(tmp_buf_raw); + COLL_SESSION_FREE(session, tmp_buf_raw); if (NULL != rindex) free(rindex); if (NULL != sindex) @@ -1268,7 +1272,8 @@ int ompi_coll_base_allreduce_intra_allgather_reduce(const void *sbuf, void *rbuf struct ompi_datatype_t *dtype, struct ompi_op_t *op, struct ompi_communicator_t *comm, - mca_coll_base_module_t *module) + mca_coll_base_module_t *module, + ompi_op_gpu_session_t *session) { int line = -1; char *partial_buf = NULL; @@ -1289,10 +1294,10 @@ int ompi_coll_base_allreduce_intra_allgather_reduce(const void *sbuf, void *rbuf } ptrdiff_t buf_size, gap = 0; buf_size = opal_datatype_span(&dtype->super, (int64_t)count * size, &gap); - partial_buf = (char *) malloc(buf_size); + partial_buf = (char *) COLL_SESSION_ALLOC(session, buf_size); partial_buf_start = partial_buf - gap; buf_size = opal_datatype_span(&dtype->super, (int64_t)count, &gap); - tmpsend = (char *) malloc(buf_size); + tmpsend = (char *) COLL_SESSION_ALLOC(session, buf_size); tmpsend_start = tmpsend - gap; err = ompi_datatype_copy_content_same_ddt(dtype, count, @@ -1307,11 +1312,11 @@ int ompi_coll_base_allreduce_intra_allgather_reduce(const void *sbuf, void *rbuf if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; } for (int target = 1; target < size; target++) { - ompi_op_reduce(op, - partial_buf_start + (ptrdiff_t)target * count * extent, - partial_buf_start, - count, - dtype); + COLL_BASE_REDUCE(session, op, + partial_buf_start + (ptrdiff_t)target * count * extent, + partial_buf_start, + count, + dtype); } // move data to rbuf @@ -1320,18 +1325,18 @@ int ompi_coll_base_allreduce_intra_allgather_reduce(const void *sbuf, void *rbuf (char*)partial_buf_start); if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; } - if (NULL != partial_buf) free(partial_buf); - if (NULL != tmpsend) free(tmpsend); + COLL_SESSION_FREE(session, partial_buf); + COLL_SESSION_FREE(session, tmpsend); return MPI_SUCCESS; err_hndl: if (NULL != partial_buf) { - free(partial_buf); + COLL_SESSION_FREE(session, partial_buf); partial_buf = NULL; partial_buf_start = NULL; } if (NULL != tmpsend) { - free(tmpsend); + COLL_SESSION_FREE(session, tmpsend); tmpsend = NULL; tmpsend_start = NULL; } diff --git a/ompi/mca/coll/base/coll_base_exscan.c b/ompi/mca/coll/base/coll_base_exscan.c index d702eb361b9..d49b7912f7c 100644 --- a/ompi/mca/coll/base/coll_base_exscan.c +++ b/ompi/mca/coll/base/coll_base_exscan.c @@ -23,6 +23,7 @@ #include "ompi/mca/coll/base/coll_base_util.h" #include "ompi/mca/pml/pml.h" #include "ompi/op/op.h" +#include "ompi/op/op_gpu_session.h" /* * ompi_coll_base_exscan_intra_linear @@ -142,7 +143,7 @@ ompi_coll_base_exscan_intra_linear(const void *sbuf, void *rbuf, size_t count, int ompi_coll_base_exscan_intra_recursivedoubling( const void *sendbuf, void *recvbuf, size_t count, struct ompi_datatype_t *datatype, struct ompi_op_t *op, struct ompi_communicator_t *comm, - mca_coll_base_module_t *module) + mca_coll_base_module_t *module, ompi_op_gpu_session_t *session) { int err = MPI_SUCCESS; char *tmpsend_raw = NULL, *tmprecv_raw = NULL; @@ -158,8 +159,8 @@ int ompi_coll_base_exscan_intra_recursivedoubling( ptrdiff_t dsize, gap; dsize = opal_datatype_span(&datatype->super, count, &gap); - tmpsend_raw = malloc(dsize); - tmprecv_raw = malloc(dsize); + tmpsend_raw = COLL_SESSION_ALLOC(session, dsize); + tmprecv_raw = COLL_SESSION_ALLOC(session, dsize); if (NULL == tmpsend_raw || NULL == tmprecv_raw) { err = OMPI_ERR_OUT_OF_RESOURCE; goto cleanup_and_return; @@ -195,17 +196,17 @@ int ompi_coll_base_exscan_intra_recursivedoubling( is_first_block = 0; } else { /* Accumulate prefix reduction: recvbuf = precv recvbuf */ - ompi_op_reduce(op, precv, recvbuf, count, datatype); + COLL_BASE_REDUCE(session, op, precv, recvbuf, count, datatype); } /* Partial result: psend = precv psend */ - ompi_op_reduce(op, precv, psend, count, datatype); + COLL_BASE_REDUCE(session, op, precv, psend, count, datatype); } else { if (is_commute) { /* psend = precv psend */ - ompi_op_reduce(op, precv, psend, count, datatype); + COLL_BASE_REDUCE(session, op, precv, psend, count, datatype); } else { /* precv = psend precv */ - ompi_op_reduce(op, psend, precv, count, datatype); + COLL_BASE_REDUCE(session, op, psend, precv, count, datatype); char *tmp = psend; psend = precv; precv = tmp; @@ -215,9 +216,7 @@ int ompi_coll_base_exscan_intra_recursivedoubling( } cleanup_and_return: - if (NULL != tmpsend_raw) - free(tmpsend_raw); - if (NULL != tmprecv_raw) - free(tmprecv_raw); + COLL_SESSION_FREE(session, tmpsend_raw); + COLL_SESSION_FREE(session, tmprecv_raw); return err; } diff --git a/ompi/mca/coll/base/coll_base_functions.h b/ompi/mca/coll/base/coll_base_functions.h index b59ef55f437..b2ab8a16d57 100644 --- a/ompi/mca/coll/base/coll_base_functions.h +++ b/ompi/mca/coll/base/coll_base_functions.h @@ -36,6 +36,35 @@ #include "ompi/mca/coll/coll.h" #include "ompi/info/info.h" #include "ompi/request/request.h" +#include "opal/mca/allocator/allocator.h" +#include "ompi/op/op_gpu_session.h" + +/* Allocator-aware helpers for Pattern-A scratch buffers. + * Pass allocator=NULL to fall back to plain malloc/free. */ +#define COLL_BASE_ALLOC(allocator, size) \ + ((allocator) ? (allocator)->alc_alloc((allocator), (size), 0) : malloc(size)) + +#define COLL_BASE_FREE(allocator, ptr) \ + do { if (ptr) { if (allocator) (allocator)->alc_free((allocator), (ptr)); \ + else free(ptr); } } while (0) + +/* GPU session-aware helpers for reduction scratch buffers. + * When session is non-NULL, use session->allocator; otherwise fall back to + * plain malloc/free. Pass session=NULL for non-GPU collectives. */ +#define COLL_BASE_REDUCE(session, op, src, dst, count, dtype) \ + do { \ + if (NULL != (session)) \ + ompi_op_gpu_session_reduce((session), (src), (dst), (dst), (count)); \ + else \ + ompi_op_reduce((op), (src), (dst), (count), (dtype)); \ + } while (0) + +#define COLL_SESSION_ALLOC(session, size) \ + ((session) ? COLL_BASE_ALLOC((session)->allocator, (size)) : malloc(size)) + +#define COLL_SESSION_FREE(session, ptr) \ + do { if (session) { COLL_BASE_FREE((session)->allocator, (ptr)); } \ + else { if (ptr) free(ptr); } } while (0) /* need to include our own topo prototypes so we can malloc data on the comm correctly */ #include "coll_base_topo.h" @@ -193,7 +222,7 @@ int ompi_coll_base_allgather_intra_ring(ALLGATHER_ARGS); int ompi_coll_base_allgather_intra_neighborexchange(ALLGATHER_ARGS); int ompi_coll_base_allgather_intra_basic_linear(ALLGATHER_ARGS); int ompi_coll_base_allgather_intra_two_procs(ALLGATHER_ARGS); -int ompi_coll_base_allgather_intra_k_bruck(ALLGATHER_ARGS, int radix); +int ompi_coll_base_allgather_intra_k_bruck(ALLGATHER_ARGS, int radix, mca_allocator_base_module_t *allocator); int ompi_coll_base_allgather_direct_messaging(ALLGATHER_ARGS); /* All GatherV */ @@ -206,12 +235,12 @@ int ompi_coll_base_allgatherv_intra_two_procs(ALLGATHERV_ARGS); /* All Reduce */ int ompi_coll_base_allreduce_intra_nonoverlapping(ALLREDUCE_ARGS); -int ompi_coll_base_allreduce_intra_recursivedoubling(ALLREDUCE_ARGS); -int ompi_coll_base_allreduce_intra_ring(ALLREDUCE_ARGS); -int ompi_coll_base_allreduce_intra_ring_segmented(ALLREDUCE_ARGS, uint32_t segsize); +int ompi_coll_base_allreduce_intra_recursivedoubling(ALLREDUCE_ARGS, ompi_op_gpu_session_t *session); +int ompi_coll_base_allreduce_intra_ring(ALLREDUCE_ARGS, ompi_op_gpu_session_t *session); +int ompi_coll_base_allreduce_intra_ring_segmented(ALLREDUCE_ARGS, uint32_t segsize, ompi_op_gpu_session_t *session); int ompi_coll_base_allreduce_intra_basic_linear(ALLREDUCE_ARGS); -int ompi_coll_base_allreduce_intra_redscat_allgather(ALLREDUCE_ARGS); -int ompi_coll_base_allreduce_intra_allgather_reduce(ALLREDUCE_ARGS); +int ompi_coll_base_allreduce_intra_redscat_allgather(ALLREDUCE_ARGS, ompi_op_gpu_session_t *session); +int ompi_coll_base_allreduce_intra_allgather_reduce(ALLREDUCE_ARGS, ompi_op_gpu_session_t *session); /* AlltoAll */ int ompi_coll_base_alltoall_intra_pairwise(ALLTOALL_ARGS); @@ -255,48 +284,46 @@ int ompi_coll_base_bcast_intra_scatter_allgather(BCAST_ARGS, uint32_t segsize); int ompi_coll_base_bcast_intra_scatter_allgather_ring(BCAST_ARGS, uint32_t segsize); /* Exscan */ -int ompi_coll_base_exscan_intra_recursivedoubling(EXSCAN_ARGS); +int ompi_coll_base_exscan_intra_recursivedoubling(EXSCAN_ARGS, ompi_op_gpu_session_t *session); int ompi_coll_base_exscan_intra_linear(EXSCAN_ARGS); -int ompi_coll_base_exscan_intra_recursivedoubling(EXSCAN_ARGS); /* Gather */ int ompi_coll_base_gather_intra_basic_linear(GATHER_ARGS); -int ompi_coll_base_gather_intra_binomial(GATHER_ARGS); +int ompi_coll_base_gather_intra_binomial(GATHER_ARGS, mca_allocator_base_module_t *allocator); int ompi_coll_base_gather_intra_linear_sync(GATHER_ARGS, int first_segment_size); /* GatherV */ /* Reduce */ -int ompi_coll_base_reduce_generic(REDUCE_ARGS, ompi_coll_tree_t* tree, size_t count_by_segment, int max_outstanding_reqs); +int ompi_coll_base_reduce_generic(REDUCE_ARGS, ompi_coll_tree_t* tree, size_t count_by_segment, int max_outstanding_reqs, ompi_op_gpu_session_t *session); int ompi_coll_base_reduce_intra_basic_linear(REDUCE_ARGS); -int ompi_coll_base_reduce_intra_chain(REDUCE_ARGS, uint32_t segsize, int fanout, int max_outstanding_reqs ); -int ompi_coll_base_reduce_intra_pipeline(REDUCE_ARGS, uint32_t segsize, int max_outstanding_reqs ); -int ompi_coll_base_reduce_intra_binary(REDUCE_ARGS, uint32_t segsize, int max_outstanding_reqs ); -int ompi_coll_base_reduce_intra_binomial(REDUCE_ARGS, uint32_t segsize, int max_outstanding_reqs ); -int ompi_coll_base_reduce_intra_in_order_binary(REDUCE_ARGS, uint32_t segsize, int max_outstanding_reqs ); -int ompi_coll_base_reduce_intra_redscat_gather(REDUCE_ARGS); -int ompi_coll_base_reduce_intra_knomial(REDUCE_ARGS, uint32_t segsize, int max_outstanding_reqs, int radix); +int ompi_coll_base_reduce_intra_chain(REDUCE_ARGS, uint32_t segsize, int fanout, int max_outstanding_reqs, ompi_op_gpu_session_t *session); +int ompi_coll_base_reduce_intra_pipeline(REDUCE_ARGS, uint32_t segsize, int max_outstanding_reqs, ompi_op_gpu_session_t *session); +int ompi_coll_base_reduce_intra_binary(REDUCE_ARGS, uint32_t segsize, int max_outstanding_reqs, ompi_op_gpu_session_t *session); +int ompi_coll_base_reduce_intra_binomial(REDUCE_ARGS, uint32_t segsize, int max_outstanding_reqs, ompi_op_gpu_session_t *session); +int ompi_coll_base_reduce_intra_in_order_binary(REDUCE_ARGS, uint32_t segsize, int max_outstanding_reqs, ompi_op_gpu_session_t *session); +int ompi_coll_base_reduce_intra_redscat_gather(REDUCE_ARGS, ompi_op_gpu_session_t *session); +int ompi_coll_base_reduce_intra_knomial(REDUCE_ARGS, uint32_t segsize, int max_outstanding_reqs, int radix, ompi_op_gpu_session_t *session); /* Reduce_scatter */ -int ompi_coll_base_reduce_scatter_intra_nonoverlapping(REDUCESCATTER_ARGS); -int ompi_coll_base_reduce_scatter_intra_basic_recursivehalving(REDUCESCATTER_ARGS); -int ompi_coll_base_reduce_scatter_intra_ring(REDUCESCATTER_ARGS); -int ompi_coll_base_reduce_scatter_intra_butterfly(REDUCESCATTER_ARGS); +int ompi_coll_base_reduce_scatter_intra_nonoverlapping(REDUCESCATTER_ARGS, ompi_op_gpu_session_t *session); +int ompi_coll_base_reduce_scatter_intra_basic_recursivehalving(REDUCESCATTER_ARGS, ompi_op_gpu_session_t *session); +int ompi_coll_base_reduce_scatter_intra_ring(REDUCESCATTER_ARGS, ompi_op_gpu_session_t *session); +int ompi_coll_base_reduce_scatter_intra_butterfly(REDUCESCATTER_ARGS, ompi_op_gpu_session_t *session); /* Reduce_scatter_block */ -int ompi_coll_base_reduce_scatter_block_basic_linear(REDUCESCATTERBLOCK_ARGS); -int ompi_coll_base_reduce_scatter_block_intra_recursivedoubling(REDUCESCATTERBLOCK_ARGS); -int ompi_coll_base_reduce_scatter_block_intra_recursivehalving(REDUCESCATTERBLOCK_ARGS); -int ompi_coll_base_reduce_scatter_block_intra_butterfly(REDUCESCATTERBLOCK_ARGS); +int ompi_coll_base_reduce_scatter_block_basic_linear(REDUCESCATTERBLOCK_ARGS, ompi_op_gpu_session_t *session); +int ompi_coll_base_reduce_scatter_block_intra_recursivedoubling(REDUCESCATTERBLOCK_ARGS, ompi_op_gpu_session_t *session); +int ompi_coll_base_reduce_scatter_block_intra_recursivehalving(REDUCESCATTERBLOCK_ARGS, ompi_op_gpu_session_t *session); +int ompi_coll_base_reduce_scatter_block_intra_butterfly(REDUCESCATTERBLOCK_ARGS, ompi_op_gpu_session_t *session); /* Scan */ -int ompi_coll_base_scan_intra_recursivedoubling(SCAN_ARGS); +int ompi_coll_base_scan_intra_recursivedoubling(SCAN_ARGS, ompi_op_gpu_session_t *session); int ompi_coll_base_scan_intra_linear(SCAN_ARGS); -int ompi_coll_base_scan_intra_recursivedoubling(SCAN_ARGS); /* Scatter */ int ompi_coll_base_scatter_intra_basic_linear(SCATTER_ARGS); -int ompi_coll_base_scatter_intra_binomial(SCATTER_ARGS); +int ompi_coll_base_scatter_intra_binomial(SCATTER_ARGS, mca_allocator_base_module_t *allocator); int ompi_coll_base_scatter_intra_linear_nb(SCATTER_ARGS, int max_reqs); /* ScatterV */ diff --git a/ompi/mca/coll/base/coll_base_gather.c b/ompi/mca/coll/base/coll_base_gather.c index a4486152157..894af0a085a 100644 --- a/ompi/mca/coll/base/coll_base_gather.c +++ b/ompi/mca/coll/base/coll_base_gather.c @@ -44,7 +44,8 @@ ompi_coll_base_gather_intra_binomial(const void *sbuf, size_t scount, struct ompi_datatype_t *rdtype, int root, struct ompi_communicator_t *comm, - mca_coll_base_module_t *module) + mca_coll_base_module_t *module, + mca_allocator_base_module_t *allocator) { int line = -1, i, rank, vrank, size, err; size_t total_recv = 0; @@ -82,7 +83,7 @@ ompi_coll_base_gather_intra_binomial(const void *sbuf, size_t scount, } else { /* root is not on 0, allocate temp buffer for recv, * rotate data at the end */ - tempbuf = (char *) malloc(rsize); + tempbuf = (char *) COLL_BASE_ALLOC(allocator, rsize); if (NULL == tempbuf) { err= OMPI_ERR_OUT_OF_RESOURCE; line = __LINE__; goto err_hndl; } @@ -107,7 +108,7 @@ ompi_coll_base_gather_intra_binomial(const void *sbuf, size_t scount, * to the property of binimoal tree */ ompi_datatype_type_extent(sdtype, &sextent); ssize = opal_datatype_span(&sdtype->super, (int64_t)scount * size, &sgap); - tempbuf = (char *) malloc(ssize); + tempbuf = (char *) COLL_BASE_ALLOC(allocator, ssize); if (NULL == tempbuf) { err= OMPI_ERR_OUT_OF_RESOURCE; line = __LINE__; goto err_hndl; } @@ -180,17 +181,16 @@ ompi_coll_base_gather_intra_binomial(const void *sbuf, size_t scount, (char *) rbuf, ptmp + rextent * (ptrdiff_t)rcount * (ptrdiff_t)(size-root)); if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; } - free(tempbuf); + COLL_BASE_FREE(allocator, tempbuf); } } else if (!(vrank % 2)) { /* other non-leaf nodes */ - free(tempbuf); + COLL_BASE_FREE(allocator, tempbuf); } return MPI_SUCCESS; err_hndl: - if (NULL != tempbuf) - free(tempbuf); + COLL_BASE_FREE(allocator, tempbuf); OPAL_OUTPUT((ompi_coll_base_framework.framework_output, "%s:%4d\tError occurred %d, rank %2d", __FILE__, line, err, rank)); diff --git a/ompi/mca/coll/base/coll_base_reduce.c b/ompi/mca/coll/base/coll_base_reduce.c index f127abef311..3a2a50171b0 100644 --- a/ompi/mca/coll/base/coll_base_reduce.c +++ b/ompi/mca/coll/base/coll_base_reduce.c @@ -38,6 +38,7 @@ #include "ompi/mca/pml/pml.h" #include "ompi/op/op.h" #include "ompi/mca/coll/base/coll_base_functions.h" +#include "ompi/op/op_gpu_session.h" #include "coll_base_topo.h" #include "coll_base_util.h" @@ -66,7 +67,7 @@ int ompi_coll_base_reduce_generic( const void* sendbuf, void* recvbuf, size_t or int root, ompi_communicator_t* comm, mca_coll_base_module_t *module, ompi_coll_tree_t* tree, size_t count_by_segment, - int max_outstanding_reqs ) + int max_outstanding_reqs, ompi_op_gpu_session_t *session ) { char *inbuf[2] = {NULL, NULL}, *inbuf_free[2] = {NULL, NULL}; char *accumbuf = NULL, *accumbuf_free = NULL; @@ -106,7 +107,7 @@ int ompi_coll_base_reduce_generic( const void* sendbuf, void* recvbuf, size_t or if( (NULL == accumbuf) || (root != rank) ) { /* Allocate temporary accumulator buffer. */ size = opal_datatype_span(&datatype->super, original_count, &gap); - accumbuf_free = (char*)malloc(size); + accumbuf_free = (char*)COLL_SESSION_ALLOC(session, size); if (accumbuf_free == NULL) { line = __LINE__; ret = -1; goto error_hndl; } @@ -123,7 +124,7 @@ int ompi_coll_base_reduce_generic( const void* sendbuf, void* recvbuf, size_t or } /* Allocate two buffers for incoming segments */ real_segment_size = opal_datatype_span(&datatype->super, count_by_segment, &gap); - inbuf_free[0] = (char*) malloc(real_segment_size); + inbuf_free[0] = (char*) COLL_SESSION_ALLOC(session, real_segment_size); if( inbuf_free[0] == NULL ) { line = __LINE__; ret = -1; goto error_hndl; } @@ -131,7 +132,7 @@ int ompi_coll_base_reduce_generic( const void* sendbuf, void* recvbuf, size_t or /* if there is chance to overlap communication - allocate second buffer */ if( (num_segments > 1) || (tree->tree_nextsize > 1) ) { - inbuf_free[1] = (char*) malloc(real_segment_size); + inbuf_free[1] = (char*) COLL_SESSION_ALLOC(session, real_segment_size); if( inbuf_free[1] == NULL ) { line = __LINE__; ret = -1; goto error_hndl; } @@ -202,9 +203,9 @@ int ompi_coll_base_reduce_generic( const void* sendbuf, void* recvbuf, size_t or } } /* apply operation */ - ompi_op_reduce(op, local_op_buffer, - accumbuf + (ptrdiff_t)segindex * (ptrdiff_t)segment_increment, - recvcount, datatype ); + COLL_BASE_REDUCE(session, op, local_op_buffer, + accumbuf + (ptrdiff_t)segindex * (ptrdiff_t)segment_increment, + recvcount, datatype); } else if ( segindex > 0 ) { void* accumulator = accumbuf + (ptrdiff_t)(segindex-1) * (ptrdiff_t)segment_increment; if( tree->tree_nextsize <= 1 ) { @@ -213,8 +214,8 @@ int ompi_coll_base_reduce_generic( const void* sendbuf, void* recvbuf, size_t or local_op_buffer = sendtmpbuf + (ptrdiff_t)(segindex-1) * (ptrdiff_t)segment_increment; } } - ompi_op_reduce(op, local_op_buffer, accumulator, prevcount, - datatype ); + COLL_BASE_REDUCE(session, op, local_op_buffer, accumulator, prevcount, + datatype); /* all reduced on available data this step (i) complete, * pass to the next process unless you are the root. @@ -242,9 +243,9 @@ int ompi_coll_base_reduce_generic( const void* sendbuf, void* recvbuf, size_t or } /* end of for each segment */ /* clean up */ - if( inbuf_free[0] != NULL) free(inbuf_free[0]); - if( inbuf_free[1] != NULL) free(inbuf_free[1]); - if( accumbuf_free != NULL ) free(accumbuf_free); + COLL_SESSION_FREE(session, inbuf_free[0]); + COLL_SESSION_FREE(session, inbuf_free[1]); + COLL_SESSION_FREE(session, accumbuf_free); } /* leaf nodes @@ -365,9 +366,9 @@ int ompi_coll_base_reduce_generic( const void* sendbuf, void* recvbuf, size_t or } ompi_coll_base_free_reqs(sreq, max_outstanding_reqs); } - if( inbuf_free[0] != NULL ) free(inbuf_free[0]); - if( inbuf_free[1] != NULL ) free(inbuf_free[1]); - if( accumbuf_free != NULL ) free(accumbuf); + COLL_SESSION_FREE(session, inbuf_free[0]); + COLL_SESSION_FREE(session, inbuf_free[1]); + COLL_SESSION_FREE(session, accumbuf_free); OPAL_OUTPUT (( ompi_coll_base_framework.framework_output, "ERROR_HNDL: node %d file %s line %d error %d\n", rank, __FILE__, line, ret )); @@ -388,7 +389,7 @@ int ompi_coll_base_reduce_intra_chain( const void *sendbuf, void *recvbuf, size_ ompi_communicator_t* comm, mca_coll_base_module_t *module, uint32_t segsize, int fanout, - int max_outstanding_reqs ) + int max_outstanding_reqs, ompi_op_gpu_session_t *session ) { size_t segcount = count; size_t typelng; @@ -408,7 +409,7 @@ int ompi_coll_base_reduce_intra_chain( const void *sendbuf, void *recvbuf, size_ return ompi_coll_base_reduce_generic( sendbuf, recvbuf, count, datatype, op, root, comm, module, data->cached_chain, - segcount, max_outstanding_reqs ); + segcount, max_outstanding_reqs, session ); } @@ -418,7 +419,7 @@ int ompi_coll_base_reduce_intra_pipeline( const void *sendbuf, void *recvbuf, ompi_communicator_t* comm, mca_coll_base_module_t *module, uint32_t segsize, - int max_outstanding_reqs ) + int max_outstanding_reqs, ompi_op_gpu_session_t *session ) { size_t segcount = count; size_t typelng; @@ -440,7 +441,7 @@ int ompi_coll_base_reduce_intra_pipeline( const void *sendbuf, void *recvbuf, return ompi_coll_base_reduce_generic( sendbuf, recvbuf, count, datatype, op, root, comm, module, data->cached_pipeline, - segcount, max_outstanding_reqs ); + segcount, max_outstanding_reqs, session ); } int ompi_coll_base_reduce_intra_binary( const void *sendbuf, void *recvbuf, @@ -449,7 +450,7 @@ int ompi_coll_base_reduce_intra_binary( const void *sendbuf, void *recvbuf, ompi_communicator_t* comm, mca_coll_base_module_t *module, uint32_t segsize, - int max_outstanding_reqs ) + int max_outstanding_reqs, ompi_op_gpu_session_t *session ) { size_t segcount = count; size_t typelng; @@ -471,7 +472,7 @@ int ompi_coll_base_reduce_intra_binary( const void *sendbuf, void *recvbuf, return ompi_coll_base_reduce_generic( sendbuf, recvbuf, count, datatype, op, root, comm, module, data->cached_bintree, - segcount, max_outstanding_reqs ); + segcount, max_outstanding_reqs, session ); } int ompi_coll_base_reduce_intra_binomial( const void *sendbuf, void *recvbuf, @@ -480,7 +481,7 @@ int ompi_coll_base_reduce_intra_binomial( const void *sendbuf, void *recvbuf, ompi_communicator_t* comm, mca_coll_base_module_t *module, uint32_t segsize, - int max_outstanding_reqs ) + int max_outstanding_reqs, ompi_op_gpu_session_t *session ) { size_t segcount = count; size_t typelng; @@ -502,7 +503,7 @@ int ompi_coll_base_reduce_intra_binomial( const void *sendbuf, void *recvbuf, return ompi_coll_base_reduce_generic( sendbuf, recvbuf, count, datatype, op, root, comm, module, data->cached_in_order_bmtree, - segcount, max_outstanding_reqs ); + segcount, max_outstanding_reqs, session ); } /* @@ -519,7 +520,7 @@ int ompi_coll_base_reduce_intra_in_order_binary( const void *sendbuf, void *recv ompi_communicator_t* comm, mca_coll_base_module_t *module, uint32_t segsize, - int max_outstanding_reqs ) + int max_outstanding_reqs, ompi_op_gpu_session_t *session ) { int ret, rank, size, io_root, segcount = count; void *use_this_sendbuf = NULL; @@ -560,7 +561,7 @@ int ompi_coll_base_reduce_intra_in_order_binary( const void *sendbuf, void *recv dsize = opal_datatype_span(&datatype->super, count, &gap); if ((root == rank) && (MPI_IN_PLACE == sendbuf)) { - tmpbuf_free = (char *) malloc(dsize); + tmpbuf_free = (char *) COLL_SESSION_ALLOC(session, dsize); if (NULL == tmpbuf_free) { return MPI_ERR_INTERN; } @@ -570,7 +571,7 @@ int ompi_coll_base_reduce_intra_in_order_binary( const void *sendbuf, void *recv (char*)recvbuf); use_this_sendbuf = tmpbuf; } else if (io_root == rank) { - tmpbuf_free = (char *) malloc(dsize); + tmpbuf_free = (char *) COLL_SESSION_ALLOC(session, dsize); if (NULL == tmpbuf_free) { return MPI_ERR_INTERN; } @@ -583,9 +584,9 @@ int ompi_coll_base_reduce_intra_in_order_binary( const void *sendbuf, void *recv ret = ompi_coll_base_reduce_generic( use_this_sendbuf, use_this_recvbuf, count, datatype, op, io_root, comm, module, data->cached_in_order_bintree, - segcount, max_outstanding_reqs ); + segcount, max_outstanding_reqs, session ); if (MPI_SUCCESS != ret) { - free(tmpbuf_free); + COLL_SESSION_FREE(session, tmpbuf_free); return ret; } @@ -597,7 +598,7 @@ int ompi_coll_base_reduce_intra_in_order_binary( const void *sendbuf, void *recv MCA_COLL_BASE_TAG_REDUCE, comm, MPI_STATUS_IGNORE)); if (MPI_SUCCESS != ret) { - free(tmpbuf_free); + COLL_SESSION_FREE(session, tmpbuf_free); return ret; } @@ -607,13 +608,13 @@ int ompi_coll_base_reduce_intra_in_order_binary( const void *sendbuf, void *recv MCA_COLL_BASE_TAG_REDUCE, MCA_PML_BASE_SEND_STANDARD, comm)); if (MPI_SUCCESS != ret) { - free(tmpbuf_free); + COLL_SESSION_FREE(session, tmpbuf_free); return ret; } } } if (NULL != tmpbuf_free) { - free(tmpbuf_free); + COLL_SESSION_FREE(session, tmpbuf_free); } return MPI_SUCCESS; @@ -812,7 +813,7 @@ ompi_coll_base_reduce_intra_basic_linear(const void *sbuf, void *rbuf, size_t co int ompi_coll_base_reduce_intra_redscat_gather( const void *sbuf, void *rbuf, size_t count, struct ompi_datatype_t *dtype, struct ompi_op_t *op, int root, struct ompi_communicator_t *comm, - mca_coll_base_module_t *module) + mca_coll_base_module_t *module, ompi_op_gpu_session_t *session) { int comm_size = ompi_comm_size(comm); int rank = ompi_comm_rank(comm); @@ -844,7 +845,7 @@ int ompi_coll_base_reduce_intra_redscat_gather( /* Temporary buffers */ char *tmp_buf_raw = NULL, *rbuf_raw = NULL; - tmp_buf_raw = malloc(dsize); + tmp_buf_raw = COLL_SESSION_ALLOC(session, dsize); if (NULL == tmp_buf_raw) { err = OMPI_ERR_OUT_OF_RESOURCE; goto cleanup_and_return; @@ -852,7 +853,7 @@ int ompi_coll_base_reduce_intra_redscat_gather( char *tmp_buf = tmp_buf_raw - gap; if (rank != root) { - rbuf_raw = malloc(dsize); + rbuf_raw = COLL_SESSION_ALLOC(session, dsize); if (NULL == rbuf_raw) { err = OMPI_ERR_OUT_OF_RESOURCE; goto cleanup_and_return; @@ -906,8 +907,8 @@ int ompi_coll_base_reduce_intra_redscat_gather( if (MPI_SUCCESS != err) { goto cleanup_and_return; } /* Reduce on the right half of the buffers (result in rbuf) */ - ompi_op_reduce(op, (char *)tmp_buf + (ptrdiff_t)count_lhalf * extent, - (char *)rbuf + count_lhalf * extent, count_rhalf, dtype); + COLL_BASE_REDUCE(session, op, (char *)tmp_buf + (ptrdiff_t)count_lhalf * extent, + (char *)rbuf + count_lhalf * extent, count_rhalf, dtype); /* Send the right half to the left neighbor */ err = MCA_PML_CALL(send((char *)rbuf + (ptrdiff_t)count_lhalf * extent, @@ -934,7 +935,7 @@ int ompi_coll_base_reduce_intra_redscat_gather( if (MPI_SUCCESS != err) { goto cleanup_and_return; } /* Reduce on the right half of the buffers (result in rbuf) */ - ompi_op_reduce(op, tmp_buf, rbuf, count_lhalf, dtype); + COLL_BASE_REDUCE(session, op, tmp_buf, rbuf, count_lhalf, dtype); /* Recv the right half from the right neighbor */ err = MCA_PML_CALL(recv((char *)rbuf + (ptrdiff_t)count_lhalf * extent, @@ -1016,9 +1017,9 @@ int ompi_coll_base_reduce_intra_redscat_gather( if (MPI_SUCCESS != err) { goto cleanup_and_return; } /* Local reduce: rbuf[] = tmp_buf[] rbuf[] */ - ompi_op_reduce(op, (char *)tmp_buf + (ptrdiff_t)rindex[step] * extent, - (char *)rbuf + (ptrdiff_t)rindex[step] * extent, - rcount[step], dtype); + COLL_BASE_REDUCE(session, op, (char *)tmp_buf + (ptrdiff_t)rindex[step] * extent, + (char *)rbuf + (ptrdiff_t)rindex[step] * extent, + rcount[step], dtype); /* Move the current window to the received message */ if (step + 1 < nsteps) { @@ -1129,10 +1130,8 @@ int ompi_coll_base_reduce_intra_redscat_gather( } cleanup_and_return: - if (NULL != tmp_buf_raw) - free(tmp_buf_raw); - if (NULL != rbuf_raw) - free(rbuf_raw); + COLL_SESSION_FREE(session, tmp_buf_raw); + COLL_SESSION_FREE(session, rbuf_raw); if (NULL != rindex) free(rindex); if (NULL != sindex) @@ -1170,7 +1169,7 @@ int ompi_coll_base_reduce_intra_knomial( const void *sendbuf, void *recvbuf, ompi_communicator_t* comm, mca_coll_base_module_t *module, uint32_t segsize, - int max_outstanding_reqs, int radix) + int max_outstanding_reqs, int radix, ompi_op_gpu_session_t *session) { int err = OMPI_SUCCESS, rank, line; ptrdiff_t extent, lb; @@ -1215,7 +1214,7 @@ int ompi_coll_base_reduce_intra_knomial( const void *sendbuf, void *recvbuf, sendtmpbuf = (char *)recvbuf; } buf_size = opal_datatype_span(&datatype->super, (int64_t)count, &gap); - reduce_buf = (char *)malloc(buf_size); + reduce_buf = (char *)COLL_SESSION_ALLOC(session, buf_size); reduce_buf_start = reduce_buf - gap; err = ompi_datatype_copy_content_same_ddt(datatype, count, (char*)reduce_buf_start, @@ -1227,7 +1226,7 @@ int ompi_coll_base_reduce_intra_knomial( const void *sendbuf, void *recvbuf, max_reqs = num_children; if(!is_leaf) { buf_size = opal_datatype_span(&datatype->super, (int64_t)count * num_children, &gap); - child_buf = (char *)malloc(buf_size); + child_buf = (char *)COLL_SESSION_ALLOC(session, buf_size); child_buf_start = child_buf - gap; reqs = ompi_coll_base_comm_get_reqs(data, max_reqs); } @@ -1250,11 +1249,10 @@ int ompi_coll_base_reduce_intra_knomial( const void *sendbuf, void *recvbuf, } for (int i = 0; i < num_children; i++) { - ompi_op_reduce(op, - child_buf_start + (ptrdiff_t)i * count * extent, - reduce_buf, - count, - datatype); + COLL_BASE_REDUCE(session, op, + child_buf_start + (ptrdiff_t)i * count * extent, + reduce_buf, + count, datatype); } if (rank != root) { @@ -1275,18 +1273,18 @@ int ompi_coll_base_reduce_intra_knomial( const void *sendbuf, void *recvbuf, if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; } } - if (NULL != child_buf) free(child_buf); - if (NULL != reduce_buf) free(reduce_buf); + COLL_SESSION_FREE(session, child_buf); + COLL_SESSION_FREE(session, reduce_buf); return MPI_SUCCESS; err_hndl: if (NULL != child_buf) { - free(child_buf); + COLL_SESSION_FREE(session, child_buf); child_buf = NULL; child_buf_start = NULL; } if (NULL != reduce_buf) { - free(reduce_buf); + COLL_SESSION_FREE(session, reduce_buf); reduce_buf = NULL; reduce_buf_start = NULL; } diff --git a/ompi/mca/coll/base/coll_base_reduce_scatter.c b/ompi/mca/coll/base/coll_base_reduce_scatter.c index 7a838936378..62930cccedd 100644 --- a/ompi/mca/coll/base/coll_base_reduce_scatter.c +++ b/ompi/mca/coll/base/coll_base_reduce_scatter.c @@ -34,6 +34,7 @@ #include "ompi/mca/coll/base/coll_tags.h" #include "ompi/mca/pml/pml.h" #include "ompi/op/op.h" +#include "ompi/op/op_gpu_session.h" #include "ompi/mca/coll/base/coll_base_functions.h" #include "coll_base_topo.h" #include "coll_base_util.h" @@ -49,7 +50,8 @@ int ompi_coll_base_reduce_scatter_intra_nonoverlapping(const void *sbuf, void *r struct ompi_datatype_t *dtype, struct ompi_op_t *op, struct ompi_communicator_t *comm, - mca_coll_base_module_t *module) + mca_coll_base_module_t *module, + ompi_op_gpu_session_t *session) { int err, i, rank, size, total_count; ptrdiff_t *displs = NULL; @@ -82,14 +84,14 @@ int ompi_coll_base_reduce_scatter_intra_nonoverlapping(const void *sbuf, void *r ptrdiff_t dsize, gap = 0; dsize = opal_datatype_span(&dtype->super, total_count, &gap); - tmprbuf_free = (char*) malloc(dsize); + tmprbuf_free = (char*) COLL_SESSION_ALLOC(session, dsize); tmprbuf = tmprbuf_free - gap; } err = comm->c_coll->coll_reduce (sbuf, tmprbuf, total_count, dtype, op, root, comm, comm->c_coll->coll_reduce_module); } if (MPI_SUCCESS != err) { - if (NULL != tmprbuf_free) free(tmprbuf_free); + COLL_SESSION_FREE(session, tmprbuf_free); return err; } @@ -109,7 +111,7 @@ int ompi_coll_base_reduce_scatter_intra_nonoverlapping(const void *sbuf, void *r root, comm, comm->c_coll->coll_scatterv_module); } free(displs); - if (NULL != tmprbuf_free) free(tmprbuf_free); + COLL_SESSION_FREE(session, tmprbuf_free); return err; } @@ -138,7 +140,8 @@ ompi_coll_base_reduce_scatter_intra_basic_recursivehalving( const void *sbuf, struct ompi_datatype_t *dtype, struct ompi_op_t *op, struct ompi_communicator_t *comm, - mca_coll_base_module_t *module) + mca_coll_base_module_t *module, + ompi_op_gpu_session_t *session) { int i, rank, size, err = OMPI_SUCCESS; int tmp_size, remain = 0, tmp_rank; @@ -180,7 +183,7 @@ ompi_coll_base_reduce_scatter_intra_basic_recursivehalving( const void *sbuf, } /* Allocate temporary receive buffer. */ - recv_buf_free = (char*) malloc(buf_size); + recv_buf_free = (char*) COLL_SESSION_ALLOC(session, buf_size); recv_buf = recv_buf_free - gap; if (NULL == recv_buf_free) { err = OMPI_ERR_OUT_OF_RESOURCE; @@ -188,7 +191,7 @@ ompi_coll_base_reduce_scatter_intra_basic_recursivehalving( const void *sbuf, } /* allocate temporary buffer for results */ - result_buf_free = (char*) malloc(buf_size); + result_buf_free = (char*) COLL_SESSION_ALLOC(session, buf_size); result_buf = result_buf_free - gap; /* copy local buffer into the temporary results */ @@ -221,7 +224,7 @@ ompi_coll_base_reduce_scatter_intra_basic_recursivehalving( const void *sbuf, comm, MPI_STATUS_IGNORE)); /* integrate their results into our temp results */ - ompi_op_reduce(op, recv_buf, result_buf, count, dtype); + COLL_BASE_REDUCE(session, op, recv_buf, result_buf, count, dtype); /* adjust rank to be the bottom "remain" ranks */ tmp_rank = rank / 2; @@ -339,10 +342,10 @@ ompi_coll_base_reduce_scatter_intra_basic_recursivehalving( const void *sbuf, goto cleanup; } - ompi_op_reduce(op, - recv_buf + tmp_disps[recv_index] * extent, - result_buf + tmp_disps[recv_index] * extent, - recv_count, dtype); + COLL_BASE_REDUCE(session, op, + recv_buf + tmp_disps[recv_index] * extent, + result_buf + tmp_disps[recv_index] * extent, + recv_count, dtype); } /* update for next iteration */ @@ -391,8 +394,8 @@ ompi_coll_base_reduce_scatter_intra_basic_recursivehalving( const void *sbuf, cleanup: if (NULL != disps) free(disps); - if (NULL != recv_buf_free) free(recv_buf_free); - if (NULL != result_buf_free) free(result_buf_free); + COLL_SESSION_FREE(session, recv_buf_free); + COLL_SESSION_FREE(session, result_buf_free); return err; } @@ -464,7 +467,8 @@ ompi_coll_base_reduce_scatter_intra_ring( const void *sbuf, void *rbuf, ompi_cou struct ompi_datatype_t *dtype, struct ompi_op_t *op, struct ompi_communicator_t *comm, - mca_coll_base_module_t *module) + mca_coll_base_module_t *module, + ompi_op_gpu_session_t *session) { int ret, line, rank, size, i, k, recv_from, send_to; int inbi; @@ -518,15 +522,15 @@ ompi_coll_base_reduce_scatter_intra_ring( const void *sbuf, void *rbuf, ompi_cou max_real_segsize = opal_datatype_span(&dtype->super, max_block_count, &gap); dsize = opal_datatype_span(&dtype->super, total_count, &gap); - accumbuf_free = (char*)malloc(dsize); + accumbuf_free = (char*)COLL_SESSION_ALLOC(session, dsize); if (NULL == accumbuf_free) { ret = -1; line = __LINE__; goto error_hndl; } accumbuf = accumbuf_free - gap; - inbuf_free[0] = (char*)malloc(max_real_segsize); + inbuf_free[0] = (char*)COLL_SESSION_ALLOC(session, max_real_segsize); if (NULL == inbuf_free[0]) { ret = -1; line = __LINE__; goto error_hndl; } inbuf[0] = inbuf_free[0] - gap; if (size > 2) { - inbuf_free[1] = (char*)malloc(max_real_segsize); + inbuf_free[1] = (char*)COLL_SESSION_ALLOC(session, max_real_segsize); if (NULL == inbuf_free[1]) { ret = -1; line = __LINE__; goto error_hndl; } inbuf[1] = inbuf_free[1] - gap; } @@ -591,7 +595,7 @@ ompi_coll_base_reduce_scatter_intra_ring( const void *sbuf, void *rbuf, ompi_cou rbuf[prevblock] = inbuf[inbi ^ 0x1] (op) rbuf[prevblock] */ tmprecv = accumbuf + displs[prevblock] * extent; - ompi_op_reduce(op, inbuf[inbi ^ 0x1], tmprecv, ompi_count_array_get(rcounts, prevblock), dtype); + COLL_BASE_REDUCE(session, op, inbuf[inbi ^ 0x1], tmprecv, ompi_count_array_get(rcounts, prevblock), dtype); /* send previous block to send_to */ ret = MCA_PML_CALL(send(tmprecv, ompi_count_array_get(rcounts, prevblock), dtype, send_to, @@ -607,7 +611,7 @@ ompi_coll_base_reduce_scatter_intra_ring( const void *sbuf, void *rbuf, ompi_cou /* Apply operation on the last block (my block) rbuf[rank] = inbuf[inbi] (op) rbuf[rank] */ tmprecv = accumbuf + displs[rank] * extent; - ompi_op_reduce(op, inbuf[inbi], tmprecv, ompi_count_array_get(rcounts, rank), dtype); + COLL_BASE_REDUCE(session, op, inbuf[inbi], tmprecv, ompi_count_array_get(rcounts, rank), dtype); /* Copy result from tmprecv to rbuf */ ret = ompi_datatype_copy_content_same_ddt(dtype, ompi_count_array_get(rcounts, rank), @@ -615,9 +619,9 @@ ompi_coll_base_reduce_scatter_intra_ring( const void *sbuf, void *rbuf, ompi_cou if (ret < 0) { line = __LINE__; goto error_hndl; } if (NULL != displs) free(displs); - if (NULL != accumbuf_free) free(accumbuf_free); - if (NULL != inbuf_free[0]) free(inbuf_free[0]); - if (NULL != inbuf_free[1]) free(inbuf_free[1]); + COLL_SESSION_FREE(session, accumbuf_free); + COLL_SESSION_FREE(session, inbuf_free[0]); + COLL_SESSION_FREE(session, inbuf_free[1]); return MPI_SUCCESS; @@ -626,9 +630,9 @@ ompi_coll_base_reduce_scatter_intra_ring( const void *sbuf, void *rbuf, ompi_cou __FILE__, line, rank, ret)); (void)line; // silence compiler warning if (NULL != displs) free(displs); - if (NULL != accumbuf_free) free(accumbuf_free); - if (NULL != inbuf_free[0]) free(inbuf_free[0]); - if (NULL != inbuf_free[1]) free(inbuf_free[1]); + COLL_SESSION_FREE(session, accumbuf_free); + COLL_SESSION_FREE(session, inbuf_free[0]); + COLL_SESSION_FREE(session, inbuf_free[1]); return ret; } @@ -701,7 +705,7 @@ int ompi_coll_base_reduce_scatter_intra_butterfly( const void *sbuf, void *rbuf, ompi_count_array_t rcounts, struct ompi_datatype_t *dtype, struct ompi_op_t *op, struct ompi_communicator_t *comm, - mca_coll_base_module_t *module) + mca_coll_base_module_t *module, ompi_op_gpu_session_t *session) { char *tmpbuf[2] = {NULL, NULL}, *psend, *precv; ptrdiff_t *displs = NULL, index; @@ -729,8 +733,8 @@ ompi_coll_base_reduce_scatter_intra_butterfly( ompi_datatype_type_extent(dtype, &extent); span = opal_datatype_span(&dtype->super, totalcount, &gap); - tmpbuf[0] = malloc(span); - tmpbuf[1] = malloc(span); + tmpbuf[0] = COLL_SESSION_ALLOC(session, span); + tmpbuf[1] = COLL_SESSION_ALLOC(session, span); if (NULL == tmpbuf[0] || NULL == tmpbuf[1]) { err = OMPI_ERR_OUT_OF_RESOURCE; goto cleanup_and_return; @@ -781,7 +785,7 @@ ompi_coll_base_reduce_scatter_intra_butterfly( MCA_COLL_BASE_TAG_REDUCE_SCATTER, comm, MPI_STATUS_IGNORE)); if (OMPI_SUCCESS != err) { goto cleanup_and_return; } - ompi_op_reduce(op, precv, psend, totalcount, dtype); + COLL_BASE_REDUCE(session, op, precv, psend, totalcount, dtype); /* Adjust rank to be the bottom "remain" ranks */ vrank = rank / 2; } @@ -837,15 +841,15 @@ ompi_coll_base_reduce_scatter_intra_butterfly( if (vrank < vpeer) { /* precv = psend precv */ - ompi_op_reduce(op, psend + rdispl * extent, - precv + rdispl * extent, recv_count, dtype); + COLL_BASE_REDUCE(session, op, psend + rdispl * extent, + precv + rdispl * extent, recv_count, dtype); char *p = psend; psend = precv; precv = p; } else { /* psend = precv psend */ - ompi_op_reduce(op, precv + rdispl * extent, - psend + rdispl * extent, recv_count, dtype); + COLL_BASE_REDUCE(session, op, precv + rdispl * extent, + psend + rdispl * extent, recv_count, dtype); } send_index = recv_index; } @@ -899,9 +903,7 @@ ompi_coll_base_reduce_scatter_intra_butterfly( cleanup_and_return: if (displs) free(displs); - if (tmpbuf[0]) - free(tmpbuf[0]); - if (tmpbuf[1]) - free(tmpbuf[1]); + COLL_SESSION_FREE(session, tmpbuf[0]); + COLL_SESSION_FREE(session, tmpbuf[1]); return err; } diff --git a/ompi/mca/coll/base/coll_base_reduce_scatter_block.c b/ompi/mca/coll/base/coll_base_reduce_scatter_block.c index ca4a6989bec..19319b67ff8 100644 --- a/ompi/mca/coll/base/coll_base_reduce_scatter_block.c +++ b/ompi/mca/coll/base/coll_base_reduce_scatter_block.c @@ -38,6 +38,7 @@ #include "ompi/mca/coll/basic/coll_basic.h" #include "ompi/mca/pml/pml.h" #include "ompi/op/op.h" +#include "ompi/op/op_gpu_session.h" #include "coll_tags.h" #include "coll_base_functions.h" #include "coll_base_topo.h" @@ -59,7 +60,8 @@ ompi_coll_base_reduce_scatter_block_basic_linear(const void *sbuf, void *rbuf, s struct ompi_datatype_t *dtype, struct ompi_op_t *op, struct ompi_communicator_t *comm, - mca_coll_base_module_t *module) + mca_coll_base_module_t *module, + ompi_op_gpu_session_t *session) { int rank, size, err = OMPI_SUCCESS; size_t count; @@ -101,7 +103,7 @@ ompi_coll_base_reduce_scatter_block_basic_linear(const void *sbuf, void *rbuf, s if (0 == rank) { /* temporary receive buffer. See coll_basic_reduce.c for details on sizing */ - recv_buf_free = (char*) malloc(span); + recv_buf_free = (char*) COLL_SESSION_ALLOC(session, span); if (NULL == recv_buf_free) { err = OMPI_ERR_OUT_OF_RESOURCE; goto cleanup; @@ -151,7 +153,7 @@ ompi_coll_base_reduce_scatter_block_basic_linear(const void *sbuf, void *rbuf, s if (0 == rank) { /* temporary receive buffer. See coll_basic_reduce.c for details on sizing */ - recv_buf_free = (char*) malloc(span); + recv_buf_free = (char*) COLL_SESSION_ALLOC(session, span); if (NULL == recv_buf_free) { err = OMPI_ERR_OUT_OF_RESOURCE; goto cleanup; @@ -174,7 +176,7 @@ ompi_coll_base_reduce_scatter_block_basic_linear(const void *sbuf, void *rbuf, s } cleanup: - if (NULL != recv_buf_free) free(recv_buf_free); + COLL_SESSION_FREE(session, recv_buf_free); return err; } @@ -198,7 +200,7 @@ int ompi_coll_base_reduce_scatter_block_intra_recursivedoubling( const void *sbuf, void *rbuf, size_t rcount, struct ompi_datatype_t *dtype, struct ompi_op_t *op, struct ompi_communicator_t *comm, - mca_coll_base_module_t *module) + mca_coll_base_module_t *module, ompi_op_gpu_session_t *session) { struct ompi_datatype_t *dtypesend = NULL, *dtyperecv = NULL; char *tmprecv_raw = NULL, *tmpbuf_raw = NULL, *tmprecv, *tmpbuf; @@ -225,12 +227,12 @@ ompi_coll_base_reduce_scatter_block_intra_recursivedoubling( * will overflow an int data type. * Fallback to the linear algorithm. */ - return ompi_coll_base_reduce_scatter_block_basic_linear(sbuf, rbuf, rcount, dtype, op, comm, module); + return ompi_coll_base_reduce_scatter_block_basic_linear(sbuf, rbuf, rcount, dtype, op, comm, module, session); } ompi_datatype_type_extent(dtype, &extent); span = opal_datatype_span(&dtype->super, totalcount, &gap); - tmpbuf_raw = malloc(span); - tmprecv_raw = malloc(span); + tmpbuf_raw = COLL_SESSION_ALLOC(session, span); + tmprecv_raw = COLL_SESSION_ALLOC(session, span); if (NULL == tmpbuf_raw || NULL == tmprecv_raw) { err = OMPI_ERR_OUT_OF_RESOURCE; goto cleanup_and_return; @@ -340,15 +342,15 @@ ompi_coll_base_reduce_scatter_block_intra_recursivedoubling( if (is_block_received) { /* After reduction the result must be in tmpbuf */ if (is_commutative || (remote_tree_root < cur_tree_root)) { - ompi_op_reduce(op, tmprecv, tmpbuf, blocklens[0], dtype); - ompi_op_reduce(op, tmprecv + (ptrdiff_t)displs[1] * extent, - tmpbuf + (ptrdiff_t)displs[1] * extent, - blocklens[1], dtype); + COLL_BASE_REDUCE(session, op, tmprecv, tmpbuf, blocklens[0], dtype); + COLL_BASE_REDUCE(session, op, tmprecv + (ptrdiff_t)displs[1] * extent, + tmpbuf + (ptrdiff_t)displs[1] * extent, + blocklens[1], dtype); } else { - ompi_op_reduce(op, tmpbuf, tmprecv, blocklens[0], dtype); - ompi_op_reduce(op, tmpbuf + (ptrdiff_t)displs[1] * extent, - tmprecv + (ptrdiff_t)displs[1] * extent, - blocklens[1], dtype); + COLL_BASE_REDUCE(session, op, tmpbuf, tmprecv, blocklens[0], dtype); + COLL_BASE_REDUCE(session, op, tmpbuf + (ptrdiff_t)displs[1] * extent, + tmprecv + (ptrdiff_t)displs[1] * extent, + blocklens[1], dtype); err = ompi_datatype_copy_content_same_ddt(dtyperecv, 1, tmpbuf, tmprecv); if (MPI_SUCCESS != err) { goto cleanup_and_return; } @@ -368,10 +370,8 @@ ompi_coll_base_reduce_scatter_block_intra_recursivedoubling( ompi_datatype_destroy(&dtypesend); if (dtyperecv) ompi_datatype_destroy(&dtyperecv); - if (tmpbuf_raw) - free(tmpbuf_raw); - if (tmprecv_raw) - free(tmprecv_raw); + COLL_SESSION_FREE(session, tmpbuf_raw); + COLL_SESSION_FREE(session, tmprecv_raw); return err; } @@ -406,7 +406,7 @@ int ompi_coll_base_reduce_scatter_block_intra_recursivehalving( const void *sbuf, void *rbuf, size_t rcount, struct ompi_datatype_t *dtype, struct ompi_op_t *op, struct ompi_communicator_t *comm, - mca_coll_base_module_t *module) + mca_coll_base_module_t *module, ompi_op_gpu_session_t *session) { char *tmprecv_raw = NULL, *tmpbuf_raw = NULL, *tmprecv, *tmpbuf; ptrdiff_t span, gap, totalcount, extent; @@ -425,14 +425,14 @@ ompi_coll_base_reduce_scatter_block_intra_recursivehalving( "coll:base:reduce_scatter_block_intra_recursivehalving: rank %d/%d " "switching to basic reduce_scatter_block", rank, comm_size)); return ompi_coll_base_reduce_scatter_block_basic_linear(sbuf, rbuf, rcount, dtype, - op, comm, module); + op, comm, module, session); } totalcount = comm_size * (size_t)rcount; ompi_datatype_type_extent(dtype, &extent); span = opal_datatype_span(&dtype->super, totalcount, &gap); - tmpbuf_raw = malloc(span); - tmprecv_raw = malloc(span); + tmpbuf_raw = COLL_SESSION_ALLOC(session, span); + tmprecv_raw = COLL_SESSION_ALLOC(session, span); if (NULL == tmpbuf_raw || NULL == tmprecv_raw) { err = OMPI_ERR_OUT_OF_RESOURCE; goto cleanup_and_return; @@ -482,7 +482,7 @@ ompi_coll_base_reduce_scatter_block_intra_recursivehalving( MCA_COLL_BASE_TAG_REDUCE_SCATTER_BLOCK, comm, MPI_STATUS_IGNORE)); if (OMPI_SUCCESS != err) { goto cleanup_and_return; } - ompi_op_reduce(op, tmprecv, tmpbuf, totalcount, dtype); + COLL_BASE_REDUCE(session, op, tmprecv, tmpbuf, totalcount, dtype); /* Adjust rank to be the bottom "remain" ranks */ vrank = rank / 2; } @@ -546,8 +546,8 @@ ompi_coll_base_reduce_scatter_block_intra_recursivehalving( if (recv_count > 0) { err = ompi_request_wait(&request, MPI_STATUS_IGNORE); if (OMPI_SUCCESS != err) { goto cleanup_and_return; } - ompi_op_reduce(op, tmprecv + rdispl * extent, - tmpbuf + rdispl * extent, recv_count, dtype); + COLL_BASE_REDUCE(session, op, tmprecv + rdispl * extent, + tmpbuf + rdispl * extent, recv_count, dtype); } send_index = recv_index; last_index = recv_index + mask; @@ -576,17 +576,15 @@ ompi_coll_base_reduce_scatter_block_intra_recursivehalving( } cleanup_and_return: - if (tmpbuf_raw) - free(tmpbuf_raw); - if (tmprecv_raw) - free(tmprecv_raw); + COLL_SESSION_FREE(session, tmpbuf_raw); + COLL_SESSION_FREE(session, tmprecv_raw); return err; } static int ompi_coll_base_reduce_scatter_block_intra_butterfly_pof2( const void *sbuf, void *rbuf, size_t rcount, struct ompi_datatype_t *dtype, struct ompi_op_t *op, struct ompi_communicator_t *comm, - mca_coll_base_module_t *module); + mca_coll_base_module_t *module, ompi_op_gpu_session_t *session); /* * ompi_coll_base_reduce_scatter_block_intra_butterfly @@ -648,7 +646,7 @@ int ompi_coll_base_reduce_scatter_block_intra_butterfly( const void *sbuf, void *rbuf, size_t rcount, struct ompi_datatype_t *dtype, struct ompi_op_t *op, struct ompi_communicator_t *comm, - mca_coll_base_module_t *module) + mca_coll_base_module_t *module, ompi_op_gpu_session_t *session) { char *tmpbuf[2] = {NULL, NULL}, *psend, *precv; ptrdiff_t span, gap, totalcount, extent; @@ -665,14 +663,14 @@ ompi_coll_base_reduce_scatter_block_intra_butterfly( if (!(comm_size & (comm_size - 1))) { /* Special case: comm_size is a power of two */ return ompi_coll_base_reduce_scatter_block_intra_butterfly_pof2( - sbuf, rbuf, rcount, dtype, op, comm, module); + sbuf, rbuf, rcount, dtype, op, comm, module, session); } totalcount = comm_size * (size_t)rcount; ompi_datatype_type_extent(dtype, &extent); span = opal_datatype_span(&dtype->super, totalcount, &gap); - tmpbuf[0] = malloc(span); - tmpbuf[1] = malloc(span); + tmpbuf[0] = COLL_SESSION_ALLOC(session, span); + tmpbuf[1] = COLL_SESSION_ALLOC(session, span); if (NULL == tmpbuf[0] || NULL == tmpbuf[1]) { err = OMPI_ERR_OUT_OF_RESOURCE; goto cleanup_and_return; @@ -723,7 +721,7 @@ ompi_coll_base_reduce_scatter_block_intra_butterfly( MCA_COLL_BASE_TAG_REDUCE_SCATTER_BLOCK, comm, MPI_STATUS_IGNORE)); if (OMPI_SUCCESS != err) { goto cleanup_and_return; } - ompi_op_reduce(op, precv, psend, totalcount, dtype); + COLL_BASE_REDUCE(session, op, precv, psend, totalcount, dtype); /* Adjust rank to be the bottom "remain" ranks */ vrank = rank / 2; } @@ -780,15 +778,15 @@ ompi_coll_base_reduce_scatter_block_intra_butterfly( if (vrank < vpeer) { /* precv = psend precv */ - ompi_op_reduce(op, psend + (ptrdiff_t)rdispl * extent, - precv + (ptrdiff_t)rdispl * extent, recv_count, dtype); + COLL_BASE_REDUCE(session, op, psend + (ptrdiff_t)rdispl * extent, + precv + (ptrdiff_t)rdispl * extent, recv_count, dtype); char *p = psend; psend = precv; precv = p; } else { /* psend = precv psend */ - ompi_op_reduce(op, precv + (ptrdiff_t)rdispl * extent, - psend + (ptrdiff_t)rdispl * extent, recv_count, dtype); + COLL_BASE_REDUCE(session, op, precv + (ptrdiff_t)rdispl * extent, + psend + (ptrdiff_t)rdispl * extent, recv_count, dtype); } send_index = recv_index; } @@ -843,10 +841,8 @@ ompi_coll_base_reduce_scatter_block_intra_butterfly( } cleanup_and_return: - if (tmpbuf[0]) - free(tmpbuf[0]); - if (tmpbuf[1]) - free(tmpbuf[1]); + COLL_SESSION_FREE(session, tmpbuf[0]); + COLL_SESSION_FREE(session, tmpbuf[1]); return err; } @@ -895,7 +891,7 @@ static int ompi_coll_base_reduce_scatter_block_intra_butterfly_pof2( const void *sbuf, void *rbuf, size_t rcount, struct ompi_datatype_t *dtype, struct ompi_op_t *op, struct ompi_communicator_t *comm, - mca_coll_base_module_t *module) + mca_coll_base_module_t *module, ompi_op_gpu_session_t *session) { char *tmpbuf[2] = {NULL, NULL}, *psend, *precv; ptrdiff_t span, gap, totalcount, extent; @@ -909,8 +905,8 @@ ompi_coll_base_reduce_scatter_block_intra_butterfly_pof2( totalcount = comm_size * (size_t)rcount; ompi_datatype_type_extent(dtype, &extent); span = opal_datatype_span(&dtype->super, totalcount, &gap); - tmpbuf[0] = malloc(span); - tmpbuf[1] = malloc(span); + tmpbuf[0] = COLL_SESSION_ALLOC(session, span); + tmpbuf[1] = COLL_SESSION_ALLOC(session, span); if (NULL == tmpbuf[0] || NULL == tmpbuf[1]) { err = OMPI_ERR_OUT_OF_RESOURCE; goto cleanup_and_return; @@ -951,15 +947,15 @@ ompi_coll_base_reduce_scatter_block_intra_butterfly_pof2( if (rank < peer) { /* precv = psend precv */ - ompi_op_reduce(op, psend + (ptrdiff_t)recv_index * extent, - precv + (ptrdiff_t)recv_index * extent, nblocks, dtype); + COLL_BASE_REDUCE(session, op, psend + (ptrdiff_t)recv_index * extent, + precv + (ptrdiff_t)recv_index * extent, nblocks, dtype); char *p = psend; psend = precv; precv = p; } else { /* psend = precv psend */ - ompi_op_reduce(op, precv + (ptrdiff_t)recv_index * extent, - psend + (ptrdiff_t)recv_index * extent, nblocks, dtype); + COLL_BASE_REDUCE(session, op, precv + (ptrdiff_t)recv_index * extent, + psend + (ptrdiff_t)recv_index * extent, nblocks, dtype); } send_index = recv_index; } @@ -969,9 +965,7 @@ ompi_coll_base_reduce_scatter_block_intra_butterfly_pof2( if (MPI_SUCCESS != err) { goto cleanup_and_return; } cleanup_and_return: - if (tmpbuf[0]) - free(tmpbuf[0]); - if (tmpbuf[1]) - free(tmpbuf[1]); + COLL_SESSION_FREE(session, tmpbuf[0]); + COLL_SESSION_FREE(session, tmpbuf[1]); return err; } diff --git a/ompi/mca/coll/base/coll_base_scan.c b/ompi/mca/coll/base/coll_base_scan.c index 9ac99ed255e..3cf663432ee 100644 --- a/ompi/mca/coll/base/coll_base_scan.c +++ b/ompi/mca/coll/base/coll_base_scan.c @@ -23,6 +23,7 @@ #include "ompi/mca/coll/base/coll_base_util.h" #include "ompi/mca/pml/pml.h" #include "ompi/op/op.h" +#include "ompi/op/op_gpu_session.h" /* * ompi_coll_base_scan_intra_linear @@ -157,7 +158,7 @@ ompi_coll_base_scan_intra_linear(const void *sbuf, void *rbuf, size_t count, int ompi_coll_base_scan_intra_recursivedoubling( const void *sendbuf, void *recvbuf, size_t count, struct ompi_datatype_t *datatype, struct ompi_op_t *op, struct ompi_communicator_t *comm, - mca_coll_base_module_t *module) + mca_coll_base_module_t *module, ompi_op_gpu_session_t *session) { int err = MPI_SUCCESS; char *tmpsend_raw = NULL, *tmprecv_raw = NULL; @@ -179,8 +180,8 @@ int ompi_coll_base_scan_intra_recursivedoubling( ptrdiff_t dsize, gap; dsize = opal_datatype_span(&datatype->super, count, &gap); - tmpsend_raw = malloc(dsize); - tmprecv_raw = malloc(dsize); + tmpsend_raw = COLL_SESSION_ALLOC(session, dsize); + tmprecv_raw = COLL_SESSION_ALLOC(session, dsize); if (NULL == tmpsend_raw || NULL == tmprecv_raw) { err = OMPI_ERR_OUT_OF_RESOURCE; goto cleanup_and_return; @@ -203,16 +204,16 @@ int ompi_coll_base_scan_intra_recursivedoubling( if (rank > remote) { /* Accumulate prefix reduction: recvbuf = precv recvbuf */ - ompi_op_reduce(op, precv, recvbuf, count, datatype); + COLL_BASE_REDUCE(session, op, precv, recvbuf, count, datatype); /* Partial result: psend = precv psend */ - ompi_op_reduce(op, precv, psend, count, datatype); + COLL_BASE_REDUCE(session, op, precv, psend, count, datatype); } else { if (is_commute) { /* psend = precv psend */ - ompi_op_reduce(op, precv, psend, count, datatype); + COLL_BASE_REDUCE(session, op, precv, psend, count, datatype); } else { /* precv = psend precv */ - ompi_op_reduce(op, psend, precv, count, datatype); + COLL_BASE_REDUCE(session, op, psend, precv, count, datatype); char *tmp = psend; psend = precv; precv = tmp; @@ -222,9 +223,7 @@ int ompi_coll_base_scan_intra_recursivedoubling( } cleanup_and_return: - if (NULL != tmpsend_raw) - free(tmpsend_raw); - if (NULL != tmprecv_raw) - free(tmprecv_raw); + COLL_SESSION_FREE(session, tmpsend_raw); + COLL_SESSION_FREE(session, tmprecv_raw); return err; } diff --git a/ompi/mca/coll/base/coll_base_scatter.c b/ompi/mca/coll/base/coll_base_scatter.c index 795f79a5c72..87b799acc0e 100644 --- a/ompi/mca/coll/base/coll_base_scatter.c +++ b/ompi/mca/coll/base/coll_base_scatter.c @@ -64,7 +64,8 @@ ompi_coll_base_scatter_intra_binomial( const void *sbuf, size_t scount, struct ompi_datatype_t *sdtype, void *rbuf, size_t rcount, struct ompi_datatype_t *rdtype, int root, struct ompi_communicator_t *comm, - mca_coll_base_module_t *module) + mca_coll_base_module_t *module, + mca_allocator_base_module_t *allocator) { mca_coll_base_module_t *base_module = (mca_coll_base_module_t*)module; mca_coll_base_comm_t *data = base_module->base_data; @@ -110,7 +111,7 @@ ompi_coll_base_scatter_intra_binomial( opal_convertor_get_packed_size( &convertor, &packed_sizet ); packed_size = packed_sizet; packed_sizet = packed_sizet / size; - ptmp = tempbuf = (char *)malloc(packed_size); + ptmp = tempbuf = (char *) COLL_BASE_ALLOC(allocator, packed_size); if (NULL == tempbuf) { err = OMPI_ERR_OUT_OF_RESOURCE; line = __LINE__; goto err_hndl; } @@ -147,7 +148,7 @@ ompi_coll_base_scatter_intra_binomial( subtree_size = size - vrank; packed_size = scount * subtree_size; - ptmp = tempbuf = (char *)malloc(packed_size); + ptmp = tempbuf = (char *) COLL_BASE_ALLOC(allocator, packed_size); if (NULL == tempbuf) { err = OMPI_ERR_OUT_OF_RESOURCE; line = __LINE__; goto err_hndl; } @@ -185,13 +186,13 @@ ompi_coll_base_scatter_intra_binomial( curr_count -= send_count; } if (NULL != tempbuf) - free(tempbuf); + COLL_BASE_FREE(allocator, tempbuf); return MPI_SUCCESS; err_hndl: if (NULL != tempbuf) - free(tempbuf); + COLL_BASE_FREE(allocator, tempbuf); OPAL_OUTPUT((ompi_coll_base_framework.framework_output, "%s:%4d\tError occurred %d, rank %2d", __FILE__, line, err, rank)); diff --git a/ompi/mca/coll/basic/coll_basic_reduce_scatter_block.c b/ompi/mca/coll/basic/coll_basic_reduce_scatter_block.c index 5eb7adfda50..136255fe145 100644 --- a/ompi/mca/coll/basic/coll_basic_reduce_scatter_block.c +++ b/ompi/mca/coll/basic/coll_basic_reduce_scatter_block.c @@ -59,7 +59,7 @@ mca_coll_basic_reduce_scatter_block_intra(const void *sbuf, void *rbuf, size_t r struct ompi_communicator_t *comm, mca_coll_base_module_t *module) { - return ompi_coll_base_reduce_scatter_block_basic_linear(sbuf, rbuf, rcount, dtype, op, comm, module); + return ompi_coll_base_reduce_scatter_block_basic_linear(sbuf, rbuf, rcount, dtype, op, comm, module, NULL); } /* diff --git a/ompi/mca/coll/tuned/coll_tuned.h b/ompi/mca/coll/tuned/coll_tuned.h index 53bb8705aa0..d0b9f82f288 100644 --- a/ompi/mca/coll/tuned/coll_tuned.h +++ b/ompi/mca/coll/tuned/coll_tuned.h @@ -26,6 +26,7 @@ #include "ompi/request/request.h" #include "ompi/mca/coll/base/coll_base_functions.h" #include "opal/util/output.h" +#include "ompi/op/op_gpu_session.h" /* also need the dynamic rule structures */ #include "coll_tuned_dynamic_rules.h" @@ -102,7 +103,7 @@ ompi_coll_tuned_comm_query(struct ompi_communicator_t *comm, int *priority); /* All Gather */ int ompi_coll_tuned_allgather_intra_dec_fixed(ALLGATHER_ARGS); int ompi_coll_tuned_allgather_intra_dec_dynamic(ALLGATHER_ARGS); -int ompi_coll_tuned_allgather_intra_do_this(ALLGATHER_ARGS, int algorithm, int faninout, int segsize); +int ompi_coll_tuned_allgather_intra_do_this(ALLGATHER_ARGS, int algorithm, int faninout, int segsize, mca_allocator_base_module_t *allocator); int ompi_coll_tuned_allgather_intra_check_forced_init(coll_tuned_force_algorithm_mca_param_indices_t *mca_param_indices); /* All GatherV */ @@ -115,7 +116,7 @@ int ompi_coll_tuned_allgatherv_intra_check_forced_init(coll_tuned_force_algorith int ompi_coll_tuned_allreduce_intra_dec_fixed(ALLREDUCE_ARGS); int ompi_coll_tuned_allreduce_intra_disjoint_dec_fixed(ALLREDUCE_ARGS); int ompi_coll_tuned_allreduce_intra_dec_dynamic(ALLREDUCE_ARGS); -int ompi_coll_tuned_allreduce_intra_do_this(ALLREDUCE_ARGS, int algorithm, int faninout, int segsize); +int ompi_coll_tuned_allreduce_intra_do_this(ALLREDUCE_ARGS, int algorithm, int faninout, int segsize, ompi_op_gpu_session_t *session); int ompi_coll_tuned_allreduce_intra_check_forced_init (coll_tuned_force_algorithm_mca_param_indices_t *mca_param_indices); /* AlltoAll */ @@ -146,43 +147,43 @@ int ompi_coll_tuned_bcast_intra_check_forced_init (coll_tuned_force_algorithm_mc /* Gather */ int ompi_coll_tuned_gather_intra_dec_fixed(GATHER_ARGS); int ompi_coll_tuned_gather_intra_dec_dynamic(GATHER_ARGS); -int ompi_coll_tuned_gather_intra_do_this(GATHER_ARGS, int algorithm, int faninout, int segsize); +int ompi_coll_tuned_gather_intra_do_this(GATHER_ARGS, int algorithm, int faninout, int segsize, mca_allocator_base_module_t *allocator); int ompi_coll_tuned_gather_intra_check_forced_init (coll_tuned_force_algorithm_mca_param_indices_t *mca_param_indices); /* Reduce */ int ompi_coll_tuned_reduce_intra_dec_fixed(REDUCE_ARGS); int ompi_coll_tuned_reduce_intra_dec_dynamic(REDUCE_ARGS); -int ompi_coll_tuned_reduce_intra_do_this(REDUCE_ARGS, int algorithm, int faninout, int segsize, int max_oustanding_reqs); +int ompi_coll_tuned_reduce_intra_do_this(REDUCE_ARGS, int algorithm, int faninout, int segsize, int max_oustanding_reqs, ompi_op_gpu_session_t *session); int ompi_coll_tuned_reduce_intra_check_forced_init (coll_tuned_force_algorithm_mca_param_indices_t *mca_param_indices); /* Reduce_scatter */ int ompi_coll_tuned_reduce_scatter_intra_dec_fixed(REDUCESCATTER_ARGS); int ompi_coll_tuned_reduce_scatter_intra_dec_dynamic(REDUCESCATTER_ARGS); -int ompi_coll_tuned_reduce_scatter_intra_do_this(REDUCESCATTER_ARGS, int algorithm, int faninout, int segsize); +int ompi_coll_tuned_reduce_scatter_intra_do_this(REDUCESCATTER_ARGS, int algorithm, int faninout, int segsize, ompi_op_gpu_session_t *session); int ompi_coll_tuned_reduce_scatter_intra_check_forced_init (coll_tuned_force_algorithm_mca_param_indices_t *mca_param_indices); /* Reduce_scatter_block */ int ompi_coll_tuned_reduce_scatter_block_intra_dec_fixed(REDUCESCATTERBLOCK_ARGS); int ompi_coll_tuned_reduce_scatter_block_intra_dec_dynamic(REDUCESCATTERBLOCK_ARGS); -int ompi_coll_tuned_reduce_scatter_block_intra_do_this(REDUCESCATTERBLOCK_ARGS, int algorithm, int faninout, int segsize); +int ompi_coll_tuned_reduce_scatter_block_intra_do_this(REDUCESCATTERBLOCK_ARGS, int algorithm, int faninout, int segsize, ompi_op_gpu_session_t *session); int ompi_coll_tuned_reduce_scatter_block_intra_check_forced_init (coll_tuned_force_algorithm_mca_param_indices_t *mca_param_indices); /* Scatter */ int ompi_coll_tuned_scatter_intra_dec_fixed(SCATTER_ARGS); int ompi_coll_tuned_scatter_intra_dec_dynamic(SCATTER_ARGS); -int ompi_coll_tuned_scatter_intra_do_this(SCATTER_ARGS, int algorithm, int faninout, int segsize); +int ompi_coll_tuned_scatter_intra_do_this(SCATTER_ARGS, int algorithm, int faninout, int segsize, mca_allocator_base_module_t *allocator); int ompi_coll_tuned_scatter_intra_check_forced_init (coll_tuned_force_algorithm_mca_param_indices_t *mca_param_indices); /* Exscan */ int ompi_coll_tuned_exscan_intra_dec_fixed(EXSCAN_ARGS); int ompi_coll_tuned_exscan_intra_dec_dynamic(EXSCAN_ARGS); -int ompi_coll_tuned_exscan_intra_do_this(EXSCAN_ARGS, int algorithm); +int ompi_coll_tuned_exscan_intra_do_this(EXSCAN_ARGS, int algorithm, ompi_op_gpu_session_t *session); int ompi_coll_tuned_exscan_intra_check_forced_init (coll_tuned_force_algorithm_mca_param_indices_t *mca_param_indices); /* Scan */ int ompi_coll_tuned_scan_intra_dec_fixed(SCAN_ARGS); int ompi_coll_tuned_scan_intra_dec_dynamic(SCAN_ARGS); -int ompi_coll_tuned_scan_intra_do_this(SCAN_ARGS, int algorithm); +int ompi_coll_tuned_scan_intra_do_this(SCAN_ARGS, int algorithm, ompi_op_gpu_session_t *session); int ompi_coll_tuned_scan_intra_check_forced_init (coll_tuned_force_algorithm_mca_param_indices_t *mca_param_indices); struct mca_coll_tuned_component_t { diff --git a/ompi/mca/coll/tuned/coll_tuned_allgather_decision.c b/ompi/mca/coll/tuned/coll_tuned_allgather_decision.c index 052c1d5f9e4..4e9d167f79a 100644 --- a/ompi/mca/coll/tuned/coll_tuned_allgather_decision.c +++ b/ompi/mca/coll/tuned/coll_tuned_allgather_decision.c @@ -133,7 +133,8 @@ int ompi_coll_tuned_allgather_intra_do_this(const void *sbuf, size_t scount, struct ompi_datatype_t *rdtype, struct ompi_communicator_t *comm, mca_coll_base_module_t *module, - int algorithm, int faninout, int segsize) + int algorithm, int faninout, int segsize, + mca_allocator_base_module_t *allocator) { OPAL_OUTPUT_VERBOSE((COLL_TUNED_TRACING_VERBOSE, ompi_coll_tuned_stream, "coll:tuned:allgather_intra_do_this selected algorithm %d topo faninout %d segsize %d", @@ -150,7 +151,7 @@ int ompi_coll_tuned_allgather_intra_do_this(const void *sbuf, size_t scount, case (2): return ompi_coll_base_allgather_intra_k_bruck(sbuf, scount, sdtype, rbuf, rcount, rdtype, - comm, module, faninout); + comm, module, faninout, allocator); case (3): return ompi_coll_base_allgather_intra_recursivedoubling(sbuf, scount, sdtype, rbuf, rcount, rdtype, diff --git a/ompi/mca/coll/tuned/coll_tuned_allreduce_decision.c b/ompi/mca/coll/tuned/coll_tuned_allreduce_decision.c index 9a63d8c5abb..113779c90b7 100644 --- a/ompi/mca/coll/tuned/coll_tuned_allreduce_decision.c +++ b/ompi/mca/coll/tuned/coll_tuned_allreduce_decision.c @@ -130,7 +130,8 @@ int ompi_coll_tuned_allreduce_intra_do_this(const void *sbuf, void *rbuf, size_t struct ompi_op_t *op, struct ompi_communicator_t *comm, mca_coll_base_module_t *module, - int algorithm, int faninout, int segsize) + int algorithm, int faninout, int segsize, + ompi_op_gpu_session_t *session) { OPAL_OUTPUT_VERBOSE((COLL_TUNED_TRACING_VERBOSE, ompi_coll_tuned_stream, "coll:tuned:allreduce_intra_do_this algorithm %d topo fan in/out %d segsize %d", @@ -144,15 +145,15 @@ int ompi_coll_tuned_allreduce_intra_do_this(const void *sbuf, void *rbuf, size_t case (2): return ompi_coll_base_allreduce_intra_nonoverlapping(sbuf, rbuf, count, dtype, op, comm, module); case (3): - return ompi_coll_base_allreduce_intra_recursivedoubling(sbuf, rbuf, count, dtype, op, comm, module); + return ompi_coll_base_allreduce_intra_recursivedoubling(sbuf, rbuf, count, dtype, op, comm, module, session); case (4): - return ompi_coll_base_allreduce_intra_ring(sbuf, rbuf, count, dtype, op, comm, module); + return ompi_coll_base_allreduce_intra_ring(sbuf, rbuf, count, dtype, op, comm, module, session); case (5): - return ompi_coll_base_allreduce_intra_ring_segmented(sbuf, rbuf, count, dtype, op, comm, module, segsize); + return ompi_coll_base_allreduce_intra_ring_segmented(sbuf, rbuf, count, dtype, op, comm, module, segsize, session); case (6): - return ompi_coll_base_allreduce_intra_redscat_allgather(sbuf, rbuf, count, dtype, op, comm, module); + return ompi_coll_base_allreduce_intra_redscat_allgather(sbuf, rbuf, count, dtype, op, comm, module, session); case (7): - return ompi_coll_base_allreduce_intra_allgather_reduce(sbuf, rbuf, count, dtype, op, comm, module); + return ompi_coll_base_allreduce_intra_allgather_reduce(sbuf, rbuf, count, dtype, op, comm, module, session); } /* switch */ OPAL_OUTPUT_VERBOSE((COLL_TUNED_TRACING_VERBOSE, ompi_coll_tuned_stream, "coll:tuned:allreduce_intra_do_this attempt to select algorithm %d when only 0-%d is valid?", diff --git a/ompi/mca/coll/tuned/coll_tuned_decision_dynamic.c b/ompi/mca/coll/tuned/coll_tuned_decision_dynamic.c index 5d6a699b301..17d6786d949 100644 --- a/ompi/mca/coll/tuned/coll_tuned_decision_dynamic.c +++ b/ompi/mca/coll/tuned/coll_tuned_decision_dynamic.c @@ -25,7 +25,11 @@ #include "mpi.h" #include "ompi/constants.h" +#include "opal/mca/accelerator/accelerator.h" +#include "opal/mca/accelerator/base/base.h" #include "ompi/datatype/ompi_datatype.h" +#include "ompi/op/op.h" +#include "ompi/op/op_gpu_session.h" #include "ompi/communicator/communicator.h" #include "ompi/mca/coll/base/base.h" #include "ompi/mca/coll/coll.h" @@ -65,10 +69,20 @@ ompi_coll_tuned_allreduce_intra_dec_dynamic (const void *sbuf, void *rbuf, size_ /* Check first if an algorithm is set explicitly for this collective */ if (tuned_module->user_forced[ALLREDUCE].algorithm) { - return ompi_coll_tuned_allreduce_intra_do_this(sbuf, rbuf, count, dtype, op, comm, module, - tuned_module->user_forced[ALLREDUCE].algorithm, - tuned_module->user_forced[ALLREDUCE].tree_fanout, - tuned_module->user_forced[ALLREDUCE].segsize); + ompi_op_gpu_session_t *session = NULL; + int _dev_id = MCA_ACCELERATOR_NO_DEVICE_ID; + uint64_t _flags; + if ((sbuf != MPI_IN_PLACE && opal_accelerator.check_addr(sbuf, &_dev_id, &_flags) > 0) || + opal_accelerator.check_addr(rbuf, &_dev_id, &_flags) > 0) { + session = ompi_op_gpu_session_begin(op, dtype, _dev_id); + } + int rc = ompi_coll_tuned_allreduce_intra_do_this(sbuf, rbuf, count, dtype, op, comm, module, + tuned_module->user_forced[ALLREDUCE].algorithm, + tuned_module->user_forced[ALLREDUCE].tree_fanout, + tuned_module->user_forced[ALLREDUCE].segsize, + session); + ompi_op_gpu_session_end(session); + return rc; } /* check to see if we have some filebased rules */ @@ -84,10 +98,18 @@ ompi_coll_tuned_allreduce_intra_dec_dynamic (const void *sbuf, void *rbuf, size_ dsize, &faninout, &segsize, &ignoreme); if (alg) { - /* we have found a valid choice from the file based rules for this message size */ - return ompi_coll_tuned_allreduce_intra_do_this (sbuf, rbuf, count, dtype, op, - comm, module, - alg, faninout, segsize); + ompi_op_gpu_session_t *session = NULL; + int _dev_id = MCA_ACCELERATOR_NO_DEVICE_ID; + uint64_t _flags; + if ((sbuf != MPI_IN_PLACE && opal_accelerator.check_addr(sbuf, &_dev_id, &_flags) > 0) || + opal_accelerator.check_addr(rbuf, &_dev_id, &_flags) > 0) { + session = ompi_op_gpu_session_begin(op, dtype, _dev_id); + } + int rc = ompi_coll_tuned_allreduce_intra_do_this(sbuf, rbuf, count, dtype, op, + comm, module, + alg, faninout, segsize, session); + ompi_op_gpu_session_end(session); + return rc; } /* found a method */ } /*end if any com rules to check */ @@ -319,12 +341,22 @@ int ompi_coll_tuned_reduce_intra_dec_dynamic( const void *sbuf, void *rbuf, /* Check first if an algorithm is set explicitly for this collective */ if (tuned_module->user_forced[REDUCE].algorithm) { - return ompi_coll_tuned_reduce_intra_do_this(sbuf, rbuf, count, dtype, - op, root, comm, module, - tuned_module->user_forced[REDUCE].algorithm, - tuned_module->user_forced[REDUCE].chain_fanout, - tuned_module->user_forced[REDUCE].segsize, - tuned_module->user_forced[REDUCE].max_requests); + ompi_op_gpu_session_t *session = NULL; + int _dev_id = MCA_ACCELERATOR_NO_DEVICE_ID; + uint64_t _flags; + if ((sbuf != MPI_IN_PLACE && opal_accelerator.check_addr(sbuf, &_dev_id, &_flags) > 0) || + opal_accelerator.check_addr(rbuf, &_dev_id, &_flags) > 0) { + session = ompi_op_gpu_session_begin(op, dtype, _dev_id); + } + int rc = ompi_coll_tuned_reduce_intra_do_this(sbuf, rbuf, count, dtype, + op, root, comm, module, + tuned_module->user_forced[REDUCE].algorithm, + tuned_module->user_forced[REDUCE].chain_fanout, + tuned_module->user_forced[REDUCE].segsize, + tuned_module->user_forced[REDUCE].max_requests, + session); + ompi_op_gpu_session_end(session); + return rc; } /* check to see if we have some filebased rules */ @@ -341,11 +373,19 @@ int ompi_coll_tuned_reduce_intra_dec_dynamic( const void *sbuf, void *rbuf, dsize, &faninout, &segsize, &max_requests); if (alg) { - /* we have found a valid choice from the file based rules for this message size */ - return ompi_coll_tuned_reduce_intra_do_this (sbuf, rbuf, count, dtype, + ompi_op_gpu_session_t *session = NULL; + int _dev_id = MCA_ACCELERATOR_NO_DEVICE_ID; + uint64_t _flags; + if ((sbuf != MPI_IN_PLACE && opal_accelerator.check_addr(sbuf, &_dev_id, &_flags) > 0) || + opal_accelerator.check_addr(rbuf, &_dev_id, &_flags) > 0) { + session = ompi_op_gpu_session_begin(op, dtype, _dev_id); + } + int rc = ompi_coll_tuned_reduce_intra_do_this(sbuf, rbuf, count, dtype, op, root, comm, module, alg, faninout, - segsize, max_requests); + segsize, max_requests, session); + ompi_op_gpu_session_end(session); + return rc; } /* found a method */ } /*end if any com rules to check */ @@ -376,11 +416,21 @@ int ompi_coll_tuned_reduce_scatter_intra_dec_dynamic(const void *sbuf, void *rbu /* Check first if an algorithm is set explicitly for this collective */ if (tuned_module->user_forced[REDUCESCATTER].algorithm) { - return ompi_coll_tuned_reduce_scatter_intra_do_this(sbuf, rbuf, rcounts, dtype, - op, comm, module, - tuned_module->user_forced[REDUCESCATTER].algorithm, - tuned_module->user_forced[REDUCESCATTER].chain_fanout, - tuned_module->user_forced[REDUCESCATTER].segsize); + ompi_op_gpu_session_t *session = NULL; + int _dev_id = MCA_ACCELERATOR_NO_DEVICE_ID; + uint64_t _flags; + if ((sbuf != MPI_IN_PLACE && opal_accelerator.check_addr(sbuf, &_dev_id, &_flags) > 0) || + opal_accelerator.check_addr(rbuf, &_dev_id, &_flags) > 0) { + session = ompi_op_gpu_session_begin(op, dtype, _dev_id); + } + int rc = ompi_coll_tuned_reduce_scatter_intra_do_this(sbuf, rbuf, rcounts, dtype, + op, comm, module, + tuned_module->user_forced[REDUCESCATTER].algorithm, + tuned_module->user_forced[REDUCESCATTER].chain_fanout, + tuned_module->user_forced[REDUCESCATTER].segsize, + session); + ompi_op_gpu_session_end(session); + return rc; } /* check to see if we have some filebased rules */ @@ -398,10 +448,18 @@ int ompi_coll_tuned_reduce_scatter_intra_dec_dynamic(const void *sbuf, void *rbu dsize, &faninout, &segsize, &ignoreme); if (alg) { - /* we have found a valid choice from the file based rules for this message size */ - return ompi_coll_tuned_reduce_scatter_intra_do_this (sbuf, rbuf, rcounts, dtype, + ompi_op_gpu_session_t *session = NULL; + int _dev_id = MCA_ACCELERATOR_NO_DEVICE_ID; + uint64_t _flags; + if ((sbuf != MPI_IN_PLACE && opal_accelerator.check_addr(sbuf, &_dev_id, &_flags) > 0) || + opal_accelerator.check_addr(rbuf, &_dev_id, &_flags) > 0) { + session = ompi_op_gpu_session_begin(op, dtype, _dev_id); + } + int rc = ompi_coll_tuned_reduce_scatter_intra_do_this(sbuf, rbuf, rcounts, dtype, op, comm, module, - alg, faninout, segsize); + alg, faninout, segsize, session); + ompi_op_gpu_session_end(session); + return rc; } /* found a method */ } /*end if any com rules to check */ @@ -432,11 +490,21 @@ int ompi_coll_tuned_reduce_scatter_block_intra_dec_dynamic(const void *sbuf, voi /* Check first if an algorithm is set explicitly for this collective */ if (tuned_module->user_forced[REDUCESCATTERBLOCK].algorithm) { - return ompi_coll_tuned_reduce_scatter_block_intra_do_this(sbuf, rbuf, rcount, dtype, - op, comm, module, - tuned_module->user_forced[REDUCESCATTERBLOCK].algorithm, - tuned_module->user_forced[REDUCESCATTERBLOCK].chain_fanout, - tuned_module->user_forced[REDUCESCATTERBLOCK].segsize); + ompi_op_gpu_session_t *session = NULL; + int _dev_id = MCA_ACCELERATOR_NO_DEVICE_ID; + uint64_t _flags; + if ((sbuf != MPI_IN_PLACE && opal_accelerator.check_addr(sbuf, &_dev_id, &_flags) > 0) || + opal_accelerator.check_addr(rbuf, &_dev_id, &_flags) > 0) { + session = ompi_op_gpu_session_begin(op, dtype, _dev_id); + } + int rc = ompi_coll_tuned_reduce_scatter_block_intra_do_this(sbuf, rbuf, rcount, dtype, + op, comm, module, + tuned_module->user_forced[REDUCESCATTERBLOCK].algorithm, + tuned_module->user_forced[REDUCESCATTERBLOCK].chain_fanout, + tuned_module->user_forced[REDUCESCATTERBLOCK].segsize, + session); + ompi_op_gpu_session_end(session); + return rc; } /* check to see if we have some filebased rules */ @@ -453,10 +521,18 @@ int ompi_coll_tuned_reduce_scatter_block_intra_dec_dynamic(const void *sbuf, voi dsize, &faninout, &segsize, &ignoreme); if (alg) { - /* we have found a valid choice from the file based rules for this message size */ - return ompi_coll_tuned_reduce_scatter_block_intra_do_this (sbuf, rbuf, rcount, dtype, + ompi_op_gpu_session_t *session = NULL; + int _dev_id = MCA_ACCELERATOR_NO_DEVICE_ID; + uint64_t _flags; + if ((sbuf != MPI_IN_PLACE && opal_accelerator.check_addr(sbuf, &_dev_id, &_flags) > 0) || + opal_accelerator.check_addr(rbuf, &_dev_id, &_flags) > 0) { + session = ompi_op_gpu_session_begin(op, dtype, _dev_id); + } + int rc = ompi_coll_tuned_reduce_scatter_block_intra_do_this(sbuf, rbuf, rcount, dtype, op, comm, module, - alg, faninout, segsize); + alg, faninout, segsize, session); + ompi_op_gpu_session_end(session); + return rc; } /* found a method */ } /* end if any com rules to check */ @@ -487,13 +563,21 @@ int ompi_coll_tuned_allgather_intra_dec_dynamic(const void *sbuf, size_t scount, /* Check first if an algorithm is set explicitly for this collective */ if (tuned_module->user_forced[ALLGATHER].algorithm) { - /* User-forced algorithm */ + mca_allocator_base_module_t *allocator = NULL; + int _dev_id = MCA_ACCELERATOR_NO_DEVICE_ID; + uint64_t _flags; + if ((sbuf != MPI_IN_PLACE && + opal_accelerator.check_addr(sbuf, &_dev_id, &_flags) > 0) || + opal_accelerator.check_addr(rbuf, &_dev_id, &_flags) > 0) { + allocator = opal_accelerator_base_get_device_allocator(_dev_id); + } return ompi_coll_tuned_allgather_intra_do_this(sbuf, scount, sdtype, rbuf, rcount, rdtype, comm, module, tuned_module->user_forced[ALLGATHER].algorithm, tuned_module->user_forced[ALLGATHER].tree_fanout, - tuned_module->user_forced[ALLGATHER].segsize); + tuned_module->user_forced[ALLGATHER].segsize, + allocator); } if (tuned_module->com_rules[ALLGATHER]) { @@ -510,12 +594,18 @@ int ompi_coll_tuned_allgather_intra_dec_dynamic(const void *sbuf, size_t scount, alg = ompi_coll_tuned_get_target_method_params (tuned_module->com_rules[ALLGATHER], dsize, &faninout, &segsize, &ignoreme); if (alg) { - /* we have found a valid choice from the file based rules for - this message size */ - return ompi_coll_tuned_allgather_intra_do_this (sbuf, scount, sdtype, - rbuf, rcount, rdtype, - comm, module, - alg, faninout, segsize); + mca_allocator_base_module_t *allocator = NULL; + int _dev_id = MCA_ACCELERATOR_NO_DEVICE_ID; + uint64_t _flags; + if ((sbuf != MPI_IN_PLACE && + opal_accelerator.check_addr(sbuf, &_dev_id, &_flags) > 0) || + opal_accelerator.check_addr(rbuf, &_dev_id, &_flags) > 0) { + allocator = opal_accelerator_base_get_device_allocator(_dev_id); + } + return ompi_coll_tuned_allgather_intra_do_this(sbuf, scount, sdtype, + rbuf, rcount, rdtype, + comm, module, + alg, faninout, segsize, allocator); } } @@ -604,6 +694,20 @@ int ompi_coll_tuned_gather_intra_dec_dynamic(const void *sbuf, size_t scount, OPAL_OUTPUT_VERBOSE((COLL_TUNED_TRACING_VERBOSE, ompi_coll_tuned_stream, "ompi_coll_tuned_gather_intra_dec_dynamic")); + mca_allocator_base_module_t *allocator = NULL; + + /* Scratch buffer is used for data movement only (no ompi_op_reduce). + * Use device allocator when user buffers are on device. */ + { + int _dev_id = MCA_ACCELERATOR_NO_DEVICE_ID; + uint64_t _flags; + if ((sbuf != MPI_IN_PLACE && + opal_accelerator.check_addr(sbuf, &_dev_id, &_flags) > 0) || + opal_accelerator.check_addr(rbuf, &_dev_id, &_flags) > 0) { + allocator = opal_accelerator_base_get_device_allocator(_dev_id); + } + } + /* Check first if an algorithm is set explicitly for this collective */ if (tuned_module->user_forced[GATHER].algorithm) { return ompi_coll_tuned_gather_intra_do_this(sbuf, scount, sdtype, @@ -611,7 +715,8 @@ int ompi_coll_tuned_gather_intra_dec_dynamic(const void *sbuf, size_t scount, root, comm, module, tuned_module->user_forced[GATHER].algorithm, tuned_module->user_forced[GATHER].tree_fanout, - tuned_module->user_forced[GATHER].segsize); + tuned_module->user_forced[GATHER].segsize, + allocator); } /** @@ -629,11 +734,10 @@ int ompi_coll_tuned_gather_intra_dec_dynamic(const void *sbuf, size_t scount, dsize, &faninout, &segsize, &max_requests); if (alg) { - /* we have found a valid choice from the file based rules for this message size */ - return ompi_coll_tuned_gather_intra_do_this (sbuf, scount, sdtype, - rbuf, rcount, rdtype, - root, comm, module, - alg, faninout, segsize); + return ompi_coll_tuned_gather_intra_do_this(sbuf, scount, sdtype, + rbuf, rcount, rdtype, + root, comm, module, + alg, faninout, segsize, allocator); } /* found a method */ } /*end if any com rules to check */ @@ -654,6 +758,20 @@ int ompi_coll_tuned_scatter_intra_dec_dynamic(const void *sbuf, size_t scount, OPAL_OUTPUT_VERBOSE((COLL_TUNED_TRACING_VERBOSE, ompi_coll_tuned_stream, "ompi_coll_tuned_scatter_intra_dec_dynamic")); + mca_allocator_base_module_t *allocator = NULL; + + /* Scratch buffer is used for data movement only (no ompi_op_reduce). + * Use device allocator when user buffers are on device. */ + { + int _dev_id = MCA_ACCELERATOR_NO_DEVICE_ID; + uint64_t _flags; + if ((sbuf != MPI_IN_PLACE && + opal_accelerator.check_addr(sbuf, &_dev_id, &_flags) > 0) || + opal_accelerator.check_addr(rbuf, &_dev_id, &_flags) > 0) { + allocator = opal_accelerator_base_get_device_allocator(_dev_id); + } + } + /* Check first if an algorithm is set explicitly for this collective */ if (tuned_module->user_forced[SCATTER].algorithm) { return ompi_coll_tuned_scatter_intra_do_this(sbuf, scount, sdtype, @@ -661,7 +779,8 @@ int ompi_coll_tuned_scatter_intra_dec_dynamic(const void *sbuf, size_t scount, root, comm, module, tuned_module->user_forced[SCATTER].algorithm, tuned_module->user_forced[SCATTER].chain_fanout, - tuned_module->user_forced[SCATTER].segsize); + tuned_module->user_forced[SCATTER].segsize, + allocator); } /** @@ -679,11 +798,10 @@ int ompi_coll_tuned_scatter_intra_dec_dynamic(const void *sbuf, size_t scount, dsize, &faninout, &segsize, &max_requests); if (alg) { - /* we have found a valid choice from the file based rules for this message size */ - return ompi_coll_tuned_scatter_intra_do_this (sbuf, scount, sdtype, - rbuf, rcount, rdtype, - root, comm, module, - alg, faninout, segsize); + return ompi_coll_tuned_scatter_intra_do_this(sbuf, scount, sdtype, + rbuf, rcount, rdtype, + root, comm, module, + alg, faninout, segsize, allocator); } /* found a method */ } /*end if any com rules to check */ @@ -705,9 +823,19 @@ int ompi_coll_tuned_exscan_intra_dec_dynamic(const void *sbuf, void* rbuf, size_ /* Check first if an algorithm is set explicitly for this collective */ if (tuned_module->user_forced[EXSCAN].algorithm) { - return ompi_coll_tuned_exscan_intra_do_this(sbuf, rbuf, count, dtype, - op, comm, module, - tuned_module->user_forced[EXSCAN].algorithm); + ompi_op_gpu_session_t *session = NULL; + int _dev_id = MCA_ACCELERATOR_NO_DEVICE_ID; + uint64_t _flags; + if ((sbuf != MPI_IN_PLACE && opal_accelerator.check_addr(sbuf, &_dev_id, &_flags) > 0) || + opal_accelerator.check_addr(rbuf, &_dev_id, &_flags) > 0) { + session = ompi_op_gpu_session_begin(op, dtype, _dev_id); + } + int rc = ompi_coll_tuned_exscan_intra_do_this(sbuf, rbuf, count, dtype, + op, comm, module, + tuned_module->user_forced[EXSCAN].algorithm, + session); + ompi_op_gpu_session_end(session); + return rc; } /** @@ -725,10 +853,18 @@ int ompi_coll_tuned_exscan_intra_dec_dynamic(const void *sbuf, void* rbuf, size_ dsize, &faninout, &segsize, &max_requests); if (alg) { - /* we have found a valid choice from the file based rules for this message size */ - return ompi_coll_tuned_exscan_intra_do_this (sbuf, rbuf, count, dtype, - op, comm, module, - alg); + ompi_op_gpu_session_t *session = NULL; + int _dev_id = MCA_ACCELERATOR_NO_DEVICE_ID; + uint64_t _flags; + if ((sbuf != MPI_IN_PLACE && opal_accelerator.check_addr(sbuf, &_dev_id, &_flags) > 0) || + opal_accelerator.check_addr(rbuf, &_dev_id, &_flags) > 0) { + session = ompi_op_gpu_session_begin(op, dtype, _dev_id); + } + int rc = ompi_coll_tuned_exscan_intra_do_this(sbuf, rbuf, count, dtype, + op, comm, module, + alg, session); + ompi_op_gpu_session_end(session); + return rc; } /* found a method */ } /*end if any com rules to check */ @@ -749,9 +885,19 @@ int ompi_coll_tuned_scan_intra_dec_dynamic(const void *sbuf, void* rbuf, size_t /* Check first if an algorithm is set explicitly for this collective */ if (tuned_module->user_forced[SCAN].algorithm) { - return ompi_coll_tuned_scan_intra_do_this(sbuf, rbuf, count, dtype, - op, comm, module, - tuned_module->user_forced[SCAN].algorithm); + ompi_op_gpu_session_t *session = NULL; + int _dev_id = MCA_ACCELERATOR_NO_DEVICE_ID; + uint64_t _flags; + if ((sbuf != MPI_IN_PLACE && opal_accelerator.check_addr(sbuf, &_dev_id, &_flags) > 0) || + opal_accelerator.check_addr(rbuf, &_dev_id, &_flags) > 0) { + session = ompi_op_gpu_session_begin(op, dtype, _dev_id); + } + int rc = ompi_coll_tuned_scan_intra_do_this(sbuf, rbuf, count, dtype, + op, comm, module, + tuned_module->user_forced[SCAN].algorithm, + session); + ompi_op_gpu_session_end(session); + return rc; } /** @@ -769,10 +915,18 @@ int ompi_coll_tuned_scan_intra_dec_dynamic(const void *sbuf, void* rbuf, size_t dsize, &faninout, &segsize, &max_requests); if (alg) { - /* we have found a valid choice from the file based rules for this message size */ - return ompi_coll_tuned_scan_intra_do_this (sbuf, rbuf, count, dtype, - op, comm, module, - alg); + ompi_op_gpu_session_t *session = NULL; + int _dev_id = MCA_ACCELERATOR_NO_DEVICE_ID; + uint64_t _flags; + if ((sbuf != MPI_IN_PLACE && opal_accelerator.check_addr(sbuf, &_dev_id, &_flags) > 0) || + opal_accelerator.check_addr(rbuf, &_dev_id, &_flags) > 0) { + session = ompi_op_gpu_session_begin(op, dtype, _dev_id); + } + int rc = ompi_coll_tuned_scan_intra_do_this(sbuf, rbuf, count, dtype, + op, comm, module, + alg, session); + ompi_op_gpu_session_end(session); + return rc; } /* found a method */ } /*end if any com rules to check */ diff --git a/ompi/mca/coll/tuned/coll_tuned_decision_fixed.c b/ompi/mca/coll/tuned/coll_tuned_decision_fixed.c index 3b0077c9bcc..c8299c68a66 100644 --- a/ompi/mca/coll/tuned/coll_tuned_decision_fixed.c +++ b/ompi/mca/coll/tuned/coll_tuned_decision_fixed.c @@ -29,11 +29,14 @@ #include "mpi.h" #include "opal/util/bit_ops.h" +#include "opal/mca/accelerator/accelerator.h" +#include "opal/mca/accelerator/base/base.h" #include "ompi/datatype/ompi_datatype.h" #include "ompi/communicator/communicator.h" #include "ompi/mca/coll/coll.h" #include "ompi/mca/coll/base/coll_tags.h" #include "ompi/op/op.h" +#include "ompi/op/op_gpu_session.h" #include "coll_tuned.h" /* @@ -214,8 +217,19 @@ ompi_coll_tuned_allreduce_intra_dec_fixed(const void *sbuf, void *rbuf, size_t c } } - return ompi_coll_tuned_allreduce_intra_do_this (sbuf, rbuf, count, dtype, op, - comm, module, alg, 0, 0); + { + ompi_op_gpu_session_t *session = NULL; + int _dev_id = MCA_ACCELERATOR_NO_DEVICE_ID; + uint64_t _flags; + if ((sbuf != MPI_IN_PLACE && opal_accelerator.check_addr(sbuf, &_dev_id, &_flags) > 0) || + opal_accelerator.check_addr(rbuf, &_dev_id, &_flags) > 0) { + session = ompi_op_gpu_session_begin(op, dtype, _dev_id); + } + int rc = ompi_coll_tuned_allreduce_intra_do_this(sbuf, rbuf, count, dtype, op, + comm, module, alg, 0, 0, session); + ompi_op_gpu_session_end(session); + return rc; + } } @@ -402,7 +416,7 @@ ompi_coll_tuned_allreduce_intra_disjoint_dec_fixed(const void *sbuf, void *rbuf, } return ompi_coll_tuned_allreduce_intra_do_this (sbuf, rbuf, count, dtype, op, - comm, module, alg, 0, 0); + comm, module, alg, 0, 0, NULL); } @@ -1073,10 +1087,21 @@ int ompi_coll_tuned_reduce_intra_dec_fixed( const void *sendbuf, void *recvbuf, } } - int faninout = 2; - return ompi_coll_tuned_reduce_intra_do_this (sendbuf, recvbuf, count, datatype, - op, root, comm, module, - alg, faninout, 0, 0); + { + ompi_op_gpu_session_t *session = NULL; + int _dev_id = MCA_ACCELERATOR_NO_DEVICE_ID; + uint64_t _flags; + if ((sendbuf != MPI_IN_PLACE && opal_accelerator.check_addr(sendbuf, &_dev_id, &_flags) > 0) || + opal_accelerator.check_addr(recvbuf, &_dev_id, &_flags) > 0) { + session = ompi_op_gpu_session_begin(op, datatype, _dev_id); + } + int faninout = 2; + int rc = ompi_coll_tuned_reduce_intra_do_this(sendbuf, recvbuf, count, datatype, + op, root, comm, module, + alg, faninout, 0, 0, session); + ompi_op_gpu_session_end(session); + return rc; + } } /* @@ -1223,9 +1248,20 @@ int ompi_coll_tuned_reduce_scatter_intra_dec_fixed( const void *sbuf, void *rbuf } } - return ompi_coll_tuned_reduce_scatter_intra_do_this (sbuf, rbuf, rcounts, dtype, - op, comm, module, - alg, 0, 0); + { + ompi_op_gpu_session_t *session = NULL; + int _dev_id = MCA_ACCELERATOR_NO_DEVICE_ID; + uint64_t _flags; + if ((sbuf != MPI_IN_PLACE && opal_accelerator.check_addr(sbuf, &_dev_id, &_flags) > 0) || + opal_accelerator.check_addr(rbuf, &_dev_id, &_flags) > 0) { + session = ompi_op_gpu_session_begin(op, dtype, _dev_id); + } + int rc = ompi_coll_tuned_reduce_scatter_intra_do_this(sbuf, rbuf, rcounts, dtype, + op, comm, module, + alg, 0, 0, session); + ompi_op_gpu_session_end(session); + return rc; + } } /* @@ -1344,9 +1380,20 @@ int ompi_coll_tuned_reduce_scatter_block_intra_dec_fixed(const void *sbuf, void } } - return ompi_coll_tuned_reduce_scatter_block_intra_do_this (sbuf, rbuf, rcount, dtype, - op, comm, module, - alg, 0, 0); + { + ompi_op_gpu_session_t *session = NULL; + int _dev_id = MCA_ACCELERATOR_NO_DEVICE_ID; + uint64_t _flags; + if ((sbuf != MPI_IN_PLACE && opal_accelerator.check_addr(sbuf, &_dev_id, &_flags) > 0) || + opal_accelerator.check_addr(rbuf, &_dev_id, &_flags) > 0) { + session = ompi_op_gpu_session_begin(op, dtype, _dev_id); + } + int rc = ompi_coll_tuned_reduce_scatter_block_intra_do_this(sbuf, rbuf, rcount, dtype, + op, comm, module, + alg, 0, 0, session); + ompi_op_gpu_session_end(session); + return rc; + } } /* @@ -1491,10 +1538,21 @@ int ompi_coll_tuned_allgather_intra_dec_fixed(const void *sbuf, size_t scount, "ompi_coll_tuned_allgather_intra_dec_fixed rank %d com_size %d", ompi_comm_rank(comm), communicator_size)); - int faninout = 2; - return ompi_coll_tuned_allgather_intra_do_this(sbuf, scount, sdtype, - rbuf, rcount, rdtype, - comm, module, alg, faninout, 0); + { + mca_allocator_base_module_t *allocator = NULL; + int _dev_id = MCA_ACCELERATOR_NO_DEVICE_ID; + uint64_t _flags; + if ((sbuf != MPI_IN_PLACE && + opal_accelerator.check_addr(sbuf, &_dev_id, &_flags) > 0) || + opal_accelerator.check_addr(rbuf, &_dev_id, &_flags) > 0) { + allocator = opal_accelerator_base_get_device_allocator(_dev_id); + } + int faninout = 2; + return ompi_coll_tuned_allgather_intra_do_this(sbuf, scount, sdtype, + rbuf, rcount, rdtype, + comm, module, alg, faninout, 0, + allocator); + } } /* @@ -1720,10 +1778,20 @@ int ompi_coll_tuned_gather_intra_dec_fixed(const void *sbuf, size_t scount, alg = 2; } - return ompi_coll_tuned_gather_intra_do_this (sbuf, scount, sdtype, - rbuf, rcount, rdtype, - root, comm, module, - alg, 0, 0); + { + mca_allocator_base_module_t *allocator = NULL; + int _dev_id = MCA_ACCELERATOR_NO_DEVICE_ID; + uint64_t _flags; + if ((sbuf != MPI_IN_PLACE && + opal_accelerator.check_addr(sbuf, &_dev_id, &_flags) > 0) || + opal_accelerator.check_addr(rbuf, &_dev_id, &_flags) > 0) { + allocator = opal_accelerator_base_get_device_allocator(_dev_id); + } + return ompi_coll_tuned_gather_intra_do_this(sbuf, scount, sdtype, + rbuf, rcount, rdtype, + root, comm, module, + alg, 0, 0, allocator); + } } /* @@ -1825,8 +1893,18 @@ int ompi_coll_tuned_scatter_intra_dec_fixed(const void *sbuf, size_t scount, } } - return ompi_coll_tuned_scatter_intra_do_this (sbuf, scount, sdtype, - rbuf, rcount, rdtype, - root, comm, module, - alg, 0, 0); + { + mca_allocator_base_module_t *allocator = NULL; + int _dev_id = MCA_ACCELERATOR_NO_DEVICE_ID; + uint64_t _flags; + if ((sbuf != MPI_IN_PLACE && + opal_accelerator.check_addr(sbuf, &_dev_id, &_flags) > 0) || + opal_accelerator.check_addr(rbuf, &_dev_id, &_flags) > 0) { + allocator = opal_accelerator_base_get_device_allocator(_dev_id); + } + return ompi_coll_tuned_scatter_intra_do_this(sbuf, scount, sdtype, + rbuf, rcount, rdtype, + root, comm, module, + alg, 0, 0, allocator); + } } diff --git a/ompi/mca/coll/tuned/coll_tuned_exscan_decision.c b/ompi/mca/coll/tuned/coll_tuned_exscan_decision.c index 48288c5d7d7..50f16dc27cd 100644 --- a/ompi/mca/coll/tuned/coll_tuned_exscan_decision.c +++ b/ompi/mca/coll/tuned/coll_tuned_exscan_decision.c @@ -93,7 +93,7 @@ int ompi_coll_tuned_exscan_intra_do_this(const void *sbuf, void* rbuf, size_t co struct ompi_op_t *op, struct ompi_communicator_t *comm, mca_coll_base_module_t *module, - int algorithm) + int algorithm, ompi_op_gpu_session_t *session) { OPAL_OUTPUT_VERBOSE((COLL_TUNED_TRACING_VERBOSE, ompi_coll_tuned_stream, "coll:tuned:exscan_intra_do_this selected algorithm %d", @@ -104,7 +104,7 @@ int ompi_coll_tuned_exscan_intra_do_this(const void *sbuf, void* rbuf, size_t co case (1): return ompi_coll_base_exscan_intra_linear(sbuf, rbuf, count, dtype, op, comm, module); case (2): return ompi_coll_base_exscan_intra_recursivedoubling(sbuf, rbuf, count, dtype, - op, comm, module); + op, comm, module, session); } /* switch */ OPAL_OUTPUT_VERBOSE((COLL_TUNED_TRACING_VERBOSE, ompi_coll_tuned_stream, "coll:tuned:exscan_intra_do_this attempt to select algorithm %d when only 0-%d is valid?", diff --git a/ompi/mca/coll/tuned/coll_tuned_gather_decision.c b/ompi/mca/coll/tuned/coll_tuned_gather_decision.c index d356202a3bf..1845f32a00b 100644 --- a/ompi/mca/coll/tuned/coll_tuned_gather_decision.c +++ b/ompi/mca/coll/tuned/coll_tuned_gather_decision.c @@ -130,7 +130,8 @@ ompi_coll_tuned_gather_intra_do_this(const void *sbuf, size_t scount, int root, struct ompi_communicator_t *comm, mca_coll_base_module_t *module, - int algorithm, int faninout, int segsize) + int algorithm, int faninout, int segsize, + mca_allocator_base_module_t *allocator) { OPAL_OUTPUT_VERBOSE((COLL_TUNED_TRACING_VERBOSE, ompi_coll_tuned_stream, "coll:tuned:gather_intra_do_this selected algorithm %d topo faninout %d segsize %d", @@ -148,7 +149,7 @@ ompi_coll_tuned_gather_intra_do_this(const void *sbuf, size_t scount, case (2): return ompi_coll_base_gather_intra_binomial(sbuf, scount, sdtype, rbuf, rcount, rdtype, - root, comm, module); + root, comm, module, allocator); case (3): return ompi_coll_base_gather_intra_linear_sync(sbuf, scount, sdtype, rbuf, rcount, rdtype, diff --git a/ompi/mca/coll/tuned/coll_tuned_reduce_decision.c b/ompi/mca/coll/tuned/coll_tuned_reduce_decision.c index 6ae3c00f7d9..f935a680116 100644 --- a/ompi/mca/coll/tuned/coll_tuned_reduce_decision.c +++ b/ompi/mca/coll/tuned/coll_tuned_reduce_decision.c @@ -154,7 +154,8 @@ int ompi_coll_tuned_reduce_intra_do_this(const void *sbuf, void* rbuf, size_t co struct ompi_communicator_t *comm, mca_coll_base_module_t *module, int algorithm, int faninout, - int segsize, int max_requests ) + int segsize, int max_requests, + ompi_op_gpu_session_t *session) { OPAL_OUTPUT_VERBOSE((COLL_TUNED_TRACING_VERBOSE, ompi_coll_tuned_stream, "coll:tuned:reduce_intra_do_this selected algorithm %d topo faninout %d segsize %d", @@ -167,25 +168,31 @@ int ompi_coll_tuned_reduce_intra_do_this(const void *sbuf, void* rbuf, size_t co op, root, comm, module); case (2): return ompi_coll_base_reduce_intra_chain(sbuf, rbuf, count, dtype, op, root, comm, module, - segsize, faninout, max_requests); + segsize, faninout, max_requests, + session); case (3): return ompi_coll_base_reduce_intra_pipeline(sbuf, rbuf, count, dtype, op, root, comm, module, - segsize, max_requests); + segsize, max_requests, + session); case (4): return ompi_coll_base_reduce_intra_binary(sbuf, rbuf, count, dtype, op, root, comm, module, - segsize, max_requests); + segsize, max_requests, + session); case (5): return ompi_coll_base_reduce_intra_binomial(sbuf, rbuf, count, dtype, op, root, comm, module, - segsize, max_requests); + segsize, max_requests, + session); case (6): return ompi_coll_base_reduce_intra_in_order_binary(sbuf, rbuf, count, dtype, op, root, comm, module, - segsize, max_requests); + segsize, max_requests, + session); case (7): return ompi_coll_base_reduce_intra_redscat_gather(sbuf, rbuf, count, dtype, - op, root, comm, module); + op, root, comm, module, + session); case (8): return ompi_coll_base_reduce_intra_knomial(sbuf, rbuf, count, dtype, op, root, comm, module, segsize, max_requests, - faninout); + faninout, session); } /* switch */ OPAL_OUTPUT_VERBOSE((COLL_TUNED_TRACING_VERBOSE, ompi_coll_tuned_stream, "coll:tuned:reduce_intra_do_this attempt to select algorithm %d when only 0-%d is valid?", diff --git a/ompi/mca/coll/tuned/coll_tuned_reduce_scatter_block_decision.c b/ompi/mca/coll/tuned/coll_tuned_reduce_scatter_block_decision.c index f4f6bdb7590..7a05ce9ee72 100644 --- a/ompi/mca/coll/tuned/coll_tuned_reduce_scatter_block_decision.c +++ b/ompi/mca/coll/tuned/coll_tuned_reduce_scatter_block_decision.c @@ -123,7 +123,8 @@ int ompi_coll_tuned_reduce_scatter_block_intra_do_this(const void *sbuf, void *r struct ompi_op_t *op, struct ompi_communicator_t *comm, mca_coll_base_module_t *module, - int algorithm, int faninout, int segsize) + int algorithm, int faninout, int segsize, + ompi_op_gpu_session_t *session) { OPAL_OUTPUT_VERBOSE((COLL_TUNED_TRACING_VERBOSE, ompi_coll_tuned_stream, "coll:tuned:reduce_scatter_block_intra_do_this selected algorithm %d topo faninout %d segsize %d", @@ -133,13 +134,16 @@ int ompi_coll_tuned_reduce_scatter_block_intra_do_this(const void *sbuf, void *r case (0): return ompi_coll_tuned_reduce_scatter_block_intra_dec_fixed(sbuf, rbuf, rcount, dtype, op, comm, module); case (1): return ompi_coll_base_reduce_scatter_block_basic_linear(sbuf, rbuf, rcount, - dtype, op, comm, module); + dtype, op, comm, module, + session); case (2): return ompi_coll_base_reduce_scatter_block_intra_recursivedoubling(sbuf, rbuf, rcount, - dtype, op, comm, module); + dtype, op, comm, module, + session); case (3): return ompi_coll_base_reduce_scatter_block_intra_recursivehalving(sbuf, rbuf, rcount, - dtype, op, comm, module); + dtype, op, comm, module, + session); case (4): return ompi_coll_base_reduce_scatter_block_intra_butterfly(sbuf, rbuf, rcount, dtype, op, comm, - module); + module, session); } /* switch */ OPAL_OUTPUT_VERBOSE((COLL_TUNED_TRACING_VERBOSE, ompi_coll_tuned_stream, "coll:tuned:reduce_scatter_block_intra_do_this attempt to select algorithm %d when only 0-%d is valid?", diff --git a/ompi/mca/coll/tuned/coll_tuned_reduce_scatter_decision.c b/ompi/mca/coll/tuned/coll_tuned_reduce_scatter_decision.c index 16747598b6e..5c79333e567 100644 --- a/ompi/mca/coll/tuned/coll_tuned_reduce_scatter_decision.c +++ b/ompi/mca/coll/tuned/coll_tuned_reduce_scatter_decision.c @@ -130,7 +130,8 @@ int ompi_coll_tuned_reduce_scatter_intra_do_this(const void *sbuf, void* rbuf, struct ompi_op_t *op, struct ompi_communicator_t *comm, mca_coll_base_module_t *module, - int algorithm, int faninout, int segsize) + int algorithm, int faninout, int segsize, + ompi_op_gpu_session_t *session) { OPAL_OUTPUT_VERBOSE((COLL_TUNED_TRACING_VERBOSE, ompi_coll_tuned_stream, "coll:tuned:reduce_scatter_intra_do_this selected algorithm %d topo faninout %d segsize %d", @@ -140,13 +141,17 @@ int ompi_coll_tuned_reduce_scatter_intra_do_this(const void *sbuf, void* rbuf, case (0): return ompi_coll_tuned_reduce_scatter_intra_dec_fixed(sbuf, rbuf, rcounts, dtype, op, comm, module); case (1): return ompi_coll_base_reduce_scatter_intra_nonoverlapping(sbuf, rbuf, rcounts, - dtype, op, comm, module); + dtype, op, comm, module, + session); case (2): return ompi_coll_base_reduce_scatter_intra_basic_recursivehalving(sbuf, rbuf, rcounts, - dtype, op, comm, module); + dtype, op, comm, module, + session); case (3): return ompi_coll_base_reduce_scatter_intra_ring(sbuf, rbuf, rcounts, - dtype, op, comm, module); + dtype, op, comm, module, + session); case (4): return ompi_coll_base_reduce_scatter_intra_butterfly(sbuf, rbuf, rcounts, - dtype, op, comm, module); + dtype, op, comm, module, + session); } /* switch */ OPAL_OUTPUT_VERBOSE((COLL_TUNED_TRACING_VERBOSE, ompi_coll_tuned_stream, "coll:tuned:reduce_scatter_intra_do_this attempt to select algorithm %d when only 0-%d is valid?", diff --git a/ompi/mca/coll/tuned/coll_tuned_scan_decision.c b/ompi/mca/coll/tuned/coll_tuned_scan_decision.c index 903e76c4694..e16a2376c65 100644 --- a/ompi/mca/coll/tuned/coll_tuned_scan_decision.c +++ b/ompi/mca/coll/tuned/coll_tuned_scan_decision.c @@ -93,7 +93,7 @@ int ompi_coll_tuned_scan_intra_do_this(const void *sbuf, void* rbuf, size_t coun struct ompi_op_t *op, struct ompi_communicator_t *comm, mca_coll_base_module_t *module, - int algorithm) + int algorithm, ompi_op_gpu_session_t *session) { OPAL_OUTPUT_VERBOSE((COLL_TUNED_TRACING_VERBOSE, ompi_coll_tuned_stream, "coll:tuned:scan_intra_do_this selected algorithm %d", @@ -104,7 +104,7 @@ int ompi_coll_tuned_scan_intra_do_this(const void *sbuf, void* rbuf, size_t coun case (1): return ompi_coll_base_scan_intra_linear(sbuf, rbuf, count, dtype, op, comm, module); case (2): return ompi_coll_base_scan_intra_recursivedoubling(sbuf, rbuf, count, dtype, - op, comm, module); + op, comm, module, session); } /* switch */ OPAL_OUTPUT_VERBOSE((COLL_TUNED_TRACING_VERBOSE, ompi_coll_tuned_stream, "coll:tuned:scan_intra_do_this attempt to select algorithm %d when only 0-%d is valid?", diff --git a/ompi/mca/coll/tuned/coll_tuned_scatter_decision.c b/ompi/mca/coll/tuned/coll_tuned_scatter_decision.c index b1449b2955c..89a78f6fab6 100644 --- a/ompi/mca/coll/tuned/coll_tuned_scatter_decision.c +++ b/ompi/mca/coll/tuned/coll_tuned_scatter_decision.c @@ -163,7 +163,8 @@ ompi_coll_tuned_scatter_intra_do_this(const void *sbuf, size_t scount, int root, struct ompi_communicator_t *comm, mca_coll_base_module_t *module, - int algorithm, int faninout, int segsize) + int algorithm, int faninout, int segsize, + mca_allocator_base_module_t *allocator) { OPAL_OUTPUT_VERBOSE((COLL_TUNED_TRACING_VERBOSE, ompi_coll_tuned_stream, "coll:tuned:scatter_intra_do_this selected algorithm %d topo faninout %d segsize %d", @@ -181,7 +182,7 @@ ompi_coll_tuned_scatter_intra_do_this(const void *sbuf, size_t scount, case (2): return ompi_coll_base_scatter_intra_binomial(sbuf, scount, sdtype, rbuf, rcount, rdtype, - root, comm, module); + root, comm, module, allocator); case (3): return ompi_coll_base_scatter_intra_linear_nb(sbuf, scount, sdtype, rbuf, rcount, rdtype, diff --git a/ompi/mca/op/base/op_base_frame.c b/ompi/mca/op/base/op_base_frame.c index 90167300851..37052782b32 100644 --- a/ompi/mca/op/base/op_base_frame.c +++ b/ompi/mca/op/base/op_base_frame.c @@ -29,6 +29,7 @@ #include "ompi/constants.h" #include "ompi/mca/op/op.h" #include "ompi/mca/op/base/base.h" +#include "ompi/op/op_gpu_session.h" /* @@ -59,5 +60,19 @@ OBJ_CLASS_INSTANCE(ompi_op_base_module_t, opal_object_t, OBJ_CLASS_INSTANCE(ompi_op_base_module_1_0_0_t, opal_object_t, module_constructor_1_0_0, NULL); -MCA_BASE_FRAMEWORK_DECLARE(ompi, op, NULL, NULL, NULL, NULL, +static int +op_base_open(mca_base_open_flag_t flags) +{ + ompi_op_gpu_session_pool_init(); + return OMPI_SUCCESS; +} + +static int +op_base_close(void) +{ + ompi_op_gpu_session_pool_finalize(); + return OMPI_SUCCESS; +} + +MCA_BASE_FRAMEWORK_DECLARE(ompi, op, NULL, NULL, op_base_open, op_base_close, mca_op_base_static_components, 0); diff --git a/ompi/mca/op/cuda/Makefile.am b/ompi/mca/op/cuda/Makefile.am new file mode 100644 index 00000000000..b8d8ee71dd7 --- /dev/null +++ b/ompi/mca/op/cuda/Makefile.am @@ -0,0 +1,84 @@ +# +# Copyright (c) 2025 Amazon.com, Inc. or its affiliates. All rights +# reserved. +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + +# C sources (compiled by the normal C compiler) +sources = \ + op_cuda.h \ + op_cuda_component.c \ + op_cuda_session.c + +# The .cu file is compiled separately by nvcc and linked in as a plain +# object. Libtool does not know how to compile CUDA, so we use a custom +# rule. The resulting object is appended to LIBADD for both DSO and static +# builds. + +EXTRA_DIST = op_cuda_kernels.cu +CLEANFILES = op_cuda_kernels.o op_cuda_kernels.lo + +# Include paths forwarded to nvcc so it can find ompi_config.h and the +# op/mca headers. +NVCC_INCLUDES = \ + -I$(top_srcdir) \ + -I$(top_builddir) \ + -I$(top_srcdir)/ompi \ + -I$(top_builddir)/ompi \ + -I$(top_builddir)/opal \ + -I$(top_builddir)/opal/include \ + -I$(top_srcdir)/ompi/include \ + -I$(top_srcdir)/opal/include + +# Compile the .cu file with nvcc. Always pass -fPIC so the same object can +# be used for both static and DSO builds. -D_Float16=short papers over a +# GCC extension (_Float16) that NVCC's host-compiler frontend does not support. +op_cuda_kernels.o: $(srcdir)/op_cuda_kernels.cu \ + $(srcdir)/op_cuda.h + $(NVCC) $(NVCCFLAGS) $(NVCC_INCLUDES) \ + $(op_cuda_CPPFLAGS) \ + --compiler-options "$(DEFS) -D_Float16=short -fPIC" \ + -c $< -o $@ + +# Wrap the nvcc output in a libtool object file (.lo) so it can be properly +# included in noinst_LTLIBRARIES via LIBADD. +op_cuda_kernels.lo: op_cuda_kernels.o + @{ echo '# Generated by libtool (CUDA kernel; PIC via --compiler-options)'; \ + echo "pic_object='op_cuda_kernels.o'"; \ + echo "non_pic_object='op_cuda_kernels.o'"; \ + } > $@ + +AM_CPPFLAGS = $(op_cuda_CPPFLAGS) + +# ---------------------------------------------------------------------------- +# DSO build +# ---------------------------------------------------------------------------- +if MCA_BUILD_ompi_op_cuda_DSO +component_install = mca_op_cuda.la +component_noinst = +else +component_install = +component_noinst = libmca_op_cuda.la +endif + +mcacomponentdir = $(ompilibdir) +mcacomponent_LTLIBRARIES = $(component_install) + +mca_op_cuda_la_SOURCES = $(sources) +mca_op_cuda_la_LDFLAGS = -module -avoid-version $(op_cuda_LDFLAGS) +mca_op_cuda_la_LIBADD = $(op_cuda_LIBS) op_cuda_kernels.lo +mca_op_cuda_la_CPPFLAGS = $(op_cuda_CPPFLAGS) + +# ---------------------------------------------------------------------------- +# Static (convenience library) build +# ---------------------------------------------------------------------------- +noinst_LTLIBRARIES = $(component_noinst) + +libmca_op_cuda_la_SOURCES = $(sources) +libmca_op_cuda_la_LDFLAGS = -module -avoid-version $(op_cuda_LDFLAGS) +libmca_op_cuda_la_LIBADD = $(op_cuda_LIBS) op_cuda_kernels.lo +libmca_op_cuda_la_CPPFLAGS = $(op_cuda_CPPFLAGS) diff --git a/ompi/mca/op/cuda/configure.m4 b/ompi/mca/op/cuda/configure.m4 new file mode 100644 index 00000000000..71096bca74d --- /dev/null +++ b/ompi/mca/op/cuda/configure.m4 @@ -0,0 +1,127 @@ +# -*- shell-script -*- +# +# Copyright (c) 2025 Amazon.com, Inc. or its affiliates. All rights +# reserved. +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + +# MCA_ompi_op_cuda_CONFIG([action-if-can-compile], +# [action-if-cant-compile]) +# ------------------------------------------------ +# Build the CUDA persistent-kernel op component when the CUDA runtime +# toolkit (cuda_runtime.h, libcudart, nvcc) is available. +# +# Deliberately does NOT require CUDA_SUPPORT=1 (which gates on libcuda.so, +# the GPU driver API library). The op/cuda component only uses the runtime +# API and can therefore be compiled in build environments that have the CUDA +# toolkit installed but no GPU driver (e.g., CI containers, cross-build nodes). +# +# Requires --with-cuda[=DIR] to locate the toolkit. +# +# Sets: +# op_cuda_CPPFLAGS — include path for cuda_runtime.h +# op_cuda_LDFLAGS — library search path for libcudart +# op_cuda_LIBS — -lcudart +# NVCC — path to the nvcc compiler +# NVCCFLAGS — default nvcc flags (min arch SM 7.0 for __nanosleep) +# +AC_DEFUN([MCA_ompi_op_cuda_CONFIG],[ + AC_CONFIG_FILES([ompi/mca/op/cuda/Makefile]) + + # Ensure with_cuda is defined (OPAL_CHECK_CUDA parses --with-cuda). + AC_REQUIRE([OPAL_CHECK_CUDA]) + + OPAL_VAR_SCOPE_PUSH([op_cuda_save_CPPFLAGS op_cuda_save_LDFLAGS op_cuda_save_LIBS op_cuda_libdir op_cuda_nvcc_path op_cuda_incdir]) + + op_cuda_happy=no + op_cuda_incdir="" + + # Only attempt a build when the user asked for CUDA (--with-cuda[=DIR]). + AS_IF([test "x$with_cuda" != "x" && test "$with_cuda" != "no"], + [ + # Derive the include directory from $with_cuda, mirroring OPAL_CHECK_CUDA. + AS_IF([test -f "${with_cuda}/include/cuda_runtime.h"], + [op_cuda_incdir="${with_cuda}/include"], + [AS_IF([test -f "${with_cuda}/cuda_runtime.h"], + [op_cuda_incdir="${with_cuda}"], + [AS_IF([test -f "/usr/local/cuda/include/cuda_runtime.h"], + [op_cuda_incdir="/usr/local/cuda/include"])])]) + + op_cuda_save_CPPFLAGS="$CPPFLAGS" + op_cuda_save_LDFLAGS="$LDFLAGS" + op_cuda_save_LIBS="$LIBS" + + AS_IF([test -n "$op_cuda_incdir"], + [CPPFLAGS="-I$op_cuda_incdir $CPPFLAGS"]) + + # Verify the runtime header is present. + AC_CHECK_HEADER([cuda_runtime.h], + [op_cuda_happy=yes], + [AC_MSG_WARN([cuda_runtime.h not found; skipping op/cuda component]) + op_cuda_happy=no]) + + # Locate libcudart — prefer lib64, fall back to lib, then /usr/local/cuda. + AS_IF([test "$op_cuda_happy" = "yes"], + [op_cuda_libdir="" + AS_IF([test "$with_cuda" != "yes"], + [AS_IF([test -d "$with_cuda/lib64"], + [op_cuda_libdir="$with_cuda/lib64"], + [AS_IF([test -d "$with_cuda/lib"], + [op_cuda_libdir="$with_cuda/lib"])])]) + AS_IF([test -z "$op_cuda_libdir"], + [AS_IF([test -d "/usr/local/cuda/lib64"], + [op_cuda_libdir="/usr/local/cuda/lib64"], + [AS_IF([test -d "/usr/local/cuda/lib"], + [op_cuda_libdir="/usr/local/cuda/lib"])])]) + AS_IF([test -n "$op_cuda_libdir"], + [LDFLAGS="-L$op_cuda_libdir $LDFLAGS"]) + AC_CHECK_LIB([cudart], [cudaGetDeviceCount], + [op_cuda_happy=yes], + [AC_MSG_WARN([libcudart not found; skipping op/cuda component]) + op_cuda_happy=no]) + ]) + + # Locate nvcc. + AS_IF([test "$op_cuda_happy" = "yes"], + [op_cuda_nvcc_path="$PATH" + AS_IF([test "$with_cuda" != "yes" && test -d "$with_cuda/bin"], + [op_cuda_nvcc_path="$with_cuda/bin:$PATH"]) + AC_PATH_PROG([NVCC], [nvcc], [not_found], [$op_cuda_nvcc_path]) + AS_IF([test "$NVCC" = "not_found"], + [AC_MSG_WARN([nvcc not found; skipping op/cuda component]) + op_cuda_happy=no]) + ]) + + # Populate the output variables. + AS_IF([test "$op_cuda_happy" = "yes"], + [op_cuda_CPPFLAGS="-I$op_cuda_incdir" + AS_IF([test -n "$op_cuda_libdir"], + [op_cuda_LDFLAGS="-L$op_cuda_libdir"], + [op_cuda_LDFLAGS=""]) + op_cuda_LIBS="-lcudart" + # __nanosleep requires SM 7.0 (Volta) or later. + AS_IF([test "x$NVCCFLAGS" = "x"], + [NVCCFLAGS="-arch=sm_70"]) + ]) + + CPPFLAGS="$op_cuda_save_CPPFLAGS" + LDFLAGS="$op_cuda_save_LDFLAGS" + LIBS="$op_cuda_save_LIBS" + ]) + + AC_SUBST([op_cuda_CPPFLAGS]) + AC_SUBST([op_cuda_LDFLAGS]) + AC_SUBST([op_cuda_LIBS]) + AC_SUBST([NVCC]) + AC_SUBST([NVCCFLAGS]) + + OPAL_VAR_SCOPE_POP + + AS_IF([test "$op_cuda_happy" = "yes"], + [$1], + [$2]) +])dnl diff --git a/ompi/mca/op/cuda/op_cuda.h b/ompi/mca/op/cuda/op_cuda.h new file mode 100644 index 00000000000..45930c8351f --- /dev/null +++ b/ompi/mca/op/cuda/op_cuda.h @@ -0,0 +1,66 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2025 Amazon.com, Inc. or its affiliates. All rights + * reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#ifndef OMPI_MCA_OP_CUDA_H +#define OMPI_MCA_OP_CUDA_H + +#include "ompi_config.h" +#include + +#include "ompi/mca/op/op.h" +#include "ompi/op/op_gpu_session.h" /* defines ompi_op_gpu_cmd_t */ + +BEGIN_C_DECLS + +/** + * CUDA-specific cmd_queue. Inherits ompi_op_gpu_cmd_queue_t by placing it + * as the first member named "super". The CUDA stream and shutdown flag are + * stored directly here rather than in a separate priv allocation. + * Allocated with OBJ_NEW; the OBJ destructor chain releases GPU resources. + */ +typedef struct ompi_op_cuda_cmd_queue_t { + ompi_op_gpu_cmd_queue_t super; /* MUST be first */ + volatile int32_t *shutdown; /* managed-memory shutdown flag */ + cudaStream_t stream; /* private CUDA stream */ +} ompi_op_cuda_cmd_queue_t; +OBJ_CLASS_DECLARATION(ompi_op_cuda_cmd_queue_t); + +/** + * Host-side launcher function type. + * Launches the persistent kernel for one (op, type) combination. + */ +typedef void (*ompi_op_cuda_launcher_fn_t)(ompi_op_gpu_cmd_t *cmd, + volatile int32_t *shutdown, + cudaStream_t stream); + +/** + * 2D table [op_index][type_index] of launcher functions. + * NULL entries indicate unsupported (op, type) combinations; the session + * machinery returns NULL for those and the caller falls back to the host path. + * + * Indexed by OMPI_OP_BASE_FORTRAN_* × OMPI_OP_BASE_TYPE_*. + * Defined (and initialized) in op_cuda_kernels.cu. + */ +OMPI_DECLSPEC extern ompi_op_cuda_launcher_fn_t +ompi_op_cuda_kernel_fns[OMPI_OP_BASE_FORTRAN_OP_MAX][OMPI_OP_BASE_TYPE_MAX]; + +/* Defined in op_cuda_kernels.cu (extern "C") */ +void ompi_op_cuda_kernel_fns_init(void); + +/* Defined in op_cuda_session.c */ +ompi_op_gpu_cmd_queue_t *ompi_op_cuda_cmd_queue_alloc(int dev_id); +ompi_op_gpu_session_t *ompi_op_cuda_session_begin(ompi_op_gpu_cmd_queue_t *queue, + struct ompi_op_t *op, + struct ompi_datatype_t *dtype); + +END_C_DECLS + +#endif /* OMPI_MCA_OP_CUDA_H */ diff --git a/ompi/mca/op/cuda/op_cuda_component.c b/ompi/mca/op/cuda/op_cuda_component.c new file mode 100644 index 00000000000..36638f8a578 --- /dev/null +++ b/ompi/mca/op/cuda/op_cuda_component.c @@ -0,0 +1,101 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2025 Amazon.com, Inc. or its affiliates. All rights + * reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "ompi_config.h" + +#include + +#include "ompi/constants.h" +#include "ompi/op/op.h" +#include "ompi/mca/op/op.h" +#include "ompi/mca/op/base/base.h" +#include "ompi/op/op_gpu_session.h" +#include "ompi/mca/op/cuda/op_cuda.h" + + +static int cuda_component_open(void); +static int cuda_component_close(void); +static int cuda_component_init_query(bool enable_progress_threads, + bool enable_mpi_thread_multiple); +static struct ompi_op_base_module_1_0_0_t * + cuda_component_op_query(struct ompi_op_t *op, int *priority); + +/* + * Public component descriptor. + * + * This component does not provide per-op/per-type function pointers + * (opc_op_query returns NULL). Its sole contribution is the three GPU + * hooks that enable persistent-kernel GPU reduction sessions. + */ +ompi_op_base_component_1_0_0_t mca_op_cuda_component = { + .opc_version = { + OMPI_OP_BASE_VERSION_1_0_0, + + .mca_component_name = "cuda", + MCA_BASE_MAKE_VERSION(component, OMPI_MAJOR_VERSION, OMPI_MINOR_VERSION, + OMPI_RELEASE_VERSION), + .mca_open_component = cuda_component_open, + .mca_close_component = cuda_component_close, + }, + .opc_data = { + MCA_BASE_METADATA_PARAM_CHECKPOINT + }, + + .opc_init_query = cuda_component_init_query, + .opc_op_query = cuda_component_op_query, + + /* GPU session hooks */ + .opc_cmd_queue_alloc = ompi_op_cuda_cmd_queue_alloc, + .opc_session_begin = ompi_op_cuda_session_begin, +}; +MCA_BASE_COMPONENT_INIT(ompi, op, cuda) + +static int +cuda_component_open(void) +{ + return OMPI_SUCCESS; +} + +static int +cuda_component_close(void) +{ + return OMPI_SUCCESS; +} + +/* + * Only activate this component when at least one CUDA-capable device is + * present in the system. + */ +static int +cuda_component_init_query(bool enable_progress_threads, + bool enable_mpi_thread_multiple) +{ + int device_count = 0; + cudaError_t err = cudaGetDeviceCount(&device_count); + if (cudaSuccess != err || device_count <= 0) { + return OMPI_ERR_NOT_SUPPORTED; + } + // register launchers here, component_open seems to be never called + ompi_op_cuda_kernel_fns_init(); + return OMPI_SUCCESS; +} + +/* + * We do not provide per-op function pointers, only session hooks, so + * always return NULL here. + */ +static struct ompi_op_base_module_1_0_0_t * +cuda_component_op_query(struct ompi_op_t *op, int *priority) +{ + (void) op; + (void) priority; + return NULL; +} diff --git a/ompi/mca/op/cuda/op_cuda_kernels.cu b/ompi/mca/op/cuda/op_cuda_kernels.cu new file mode 100644 index 00000000000..03b1b11bd7a --- /dev/null +++ b/ompi/mca/op/cuda/op_cuda_kernels.cu @@ -0,0 +1,285 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2025 Amazon.com, Inc. or its affiliates. All rights + * reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +/* + * Persistent reduction kernels for the CUDA op component. + * + * Each kernel runs one block of 256 threads and loops indefinitely, + * sleeping between polls to reduce power consumption. The host posts + * a command by writing src/dst/count into the managed-memory slot and + * then setting status=1. The kernel executes the reduction, then sets + * status=2. The host spins on status until it sees 2, then resets it + * to 0 for the next call. A separate shutdown flag terminates the loop + * at session end. + */ + +#include +#include + +#include "ompi/mca/op/op.h" +#include "ompi/mca/op/cuda/op_cuda.h" + +/* ------------------------------------------------------------------------- + * PERSISTENT_KERNEL(name, ctype, op_expr) + * + * Generates __global__ void ompi_op_cuda_persistent_(...). + * op_expr must be a statement writing dst[i] from src1[i] and src2[i], + * e.g. "dst[i] = src1[i] + src2[i]". src2 may alias dst for in-place ops. + * ------------------------------------------------------------------------- */ +#define PERSISTENT_KERNEL(kname, ctype, op_expr) \ +__global__ void ompi_op_cuda_persistent_##kname( \ + ompi_op_gpu_cmd_t *cmd, volatile int32_t *shutdown) \ +{ \ + while (!*shutdown) { \ + /* Spin-wait for work; sleep 1 µs between polls to save power */ \ + while (cmd->status != 1 && !*shutdown) { __nanosleep(1000); } \ + if (*shutdown) break; \ + const ctype * __restrict__ src1 = (const ctype *) cmd->src1; \ + const ctype * __restrict__ src2 = (const ctype *) cmd->src2; \ + ctype * __restrict__ dst = ( ctype *) cmd->dst; \ + int64_t n = cmd->count; \ + for (int64_t i = (int64_t)threadIdx.x; i < n; i += blockDim.x) { \ + op_expr; \ + } \ + __syncthreads(); \ + if (threadIdx.x == 0) { \ + __threadfence_system(); /* ensure dst writes reach host */ \ + cmd->status = 2; /* signal done */ \ + } \ + } \ +} + +/* ========================================================================= + * Kernel instantiations + * ========================================================================= */ + +/* --- MAX --- */ +PERSISTENT_KERNEL(max_int8, int8_t, dst[i] = src1[i] > src2[i] ? src1[i] : src2[i]) +PERSISTENT_KERNEL(max_uint8, uint8_t, dst[i] = src1[i] > src2[i] ? src1[i] : src2[i]) +PERSISTENT_KERNEL(max_int16, int16_t, dst[i] = src1[i] > src2[i] ? src1[i] : src2[i]) +PERSISTENT_KERNEL(max_uint16, uint16_t, dst[i] = src1[i] > src2[i] ? src1[i] : src2[i]) +PERSISTENT_KERNEL(max_int32, int32_t, dst[i] = src1[i] > src2[i] ? src1[i] : src2[i]) +PERSISTENT_KERNEL(max_uint32, uint32_t, dst[i] = src1[i] > src2[i] ? src1[i] : src2[i]) +PERSISTENT_KERNEL(max_int64, int64_t, dst[i] = src1[i] > src2[i] ? src1[i] : src2[i]) +PERSISTENT_KERNEL(max_uint64, uint64_t, dst[i] = src1[i] > src2[i] ? src1[i] : src2[i]) +PERSISTENT_KERNEL(max_float, float, dst[i] = src1[i] > src2[i] ? src1[i] : src2[i]) +PERSISTENT_KERNEL(max_double, double, dst[i] = src1[i] > src2[i] ? src1[i] : src2[i]) + +/* --- MIN --- */ +PERSISTENT_KERNEL(min_int8, int8_t, dst[i] = src1[i] < src2[i] ? src1[i] : src2[i]) +PERSISTENT_KERNEL(min_uint8, uint8_t, dst[i] = src1[i] < src2[i] ? src1[i] : src2[i]) +PERSISTENT_KERNEL(min_int16, int16_t, dst[i] = src1[i] < src2[i] ? src1[i] : src2[i]) +PERSISTENT_KERNEL(min_uint16, uint16_t, dst[i] = src1[i] < src2[i] ? src1[i] : src2[i]) +PERSISTENT_KERNEL(min_int32, int32_t, dst[i] = src1[i] < src2[i] ? src1[i] : src2[i]) +PERSISTENT_KERNEL(min_uint32, uint32_t, dst[i] = src1[i] < src2[i] ? src1[i] : src2[i]) +PERSISTENT_KERNEL(min_int64, int64_t, dst[i] = src1[i] < src2[i] ? src1[i] : src2[i]) +PERSISTENT_KERNEL(min_uint64, uint64_t, dst[i] = src1[i] < src2[i] ? src1[i] : src2[i]) +PERSISTENT_KERNEL(min_float, float, dst[i] = src1[i] < src2[i] ? src1[i] : src2[i]) +PERSISTENT_KERNEL(min_double, double, dst[i] = src1[i] < src2[i] ? src1[i] : src2[i]) + +/* --- SUM --- */ +PERSISTENT_KERNEL(sum_int8, int8_t, dst[i] = src1[i] + src2[i]) +PERSISTENT_KERNEL(sum_uint8, uint8_t, dst[i] = src1[i] + src2[i]) +PERSISTENT_KERNEL(sum_int16, int16_t, dst[i] = src1[i] + src2[i]) +PERSISTENT_KERNEL(sum_uint16, uint16_t, dst[i] = src1[i] + src2[i]) +PERSISTENT_KERNEL(sum_int32, int32_t, dst[i] = src1[i] + src2[i]) +PERSISTENT_KERNEL(sum_uint32, uint32_t, dst[i] = src1[i] + src2[i]) +PERSISTENT_KERNEL(sum_int64, int64_t, dst[i] = src1[i] + src2[i]) +PERSISTENT_KERNEL(sum_uint64, uint64_t, dst[i] = src1[i] + src2[i]) +PERSISTENT_KERNEL(sum_float, float, dst[i] = src1[i] + src2[i]) +PERSISTENT_KERNEL(sum_double, double, dst[i] = src1[i] + src2[i]) + +/* --- PROD --- */ +PERSISTENT_KERNEL(prod_int8, int8_t, dst[i] = src1[i] * src2[i]) +PERSISTENT_KERNEL(prod_uint8, uint8_t, dst[i] = src1[i] * src2[i]) +PERSISTENT_KERNEL(prod_int16, int16_t, dst[i] = src1[i] * src2[i]) +PERSISTENT_KERNEL(prod_uint16, uint16_t, dst[i] = src1[i] * src2[i]) +PERSISTENT_KERNEL(prod_int32, int32_t, dst[i] = src1[i] * src2[i]) +PERSISTENT_KERNEL(prod_uint32, uint32_t, dst[i] = src1[i] * src2[i]) +PERSISTENT_KERNEL(prod_int64, int64_t, dst[i] = src1[i] * src2[i]) +PERSISTENT_KERNEL(prod_uint64, uint64_t, dst[i] = src1[i] * src2[i]) +PERSISTENT_KERNEL(prod_float, float, dst[i] = src1[i] * src2[i]) +PERSISTENT_KERNEL(prod_double, double, dst[i] = src1[i] * src2[i]) + +/* --- BAND (bitwise AND, integer types only) --- */ +PERSISTENT_KERNEL(band_int8, int8_t, dst[i] = src1[i] & src2[i]) +PERSISTENT_KERNEL(band_uint8, uint8_t, dst[i] = src1[i] & src2[i]) +PERSISTENT_KERNEL(band_int16, int16_t, dst[i] = src1[i] & src2[i]) +PERSISTENT_KERNEL(band_uint16, uint16_t, dst[i] = src1[i] & src2[i]) +PERSISTENT_KERNEL(band_int32, int32_t, dst[i] = src1[i] & src2[i]) +PERSISTENT_KERNEL(band_uint32, uint32_t, dst[i] = src1[i] & src2[i]) +PERSISTENT_KERNEL(band_int64, int64_t, dst[i] = src1[i] & src2[i]) +PERSISTENT_KERNEL(band_uint64, uint64_t, dst[i] = src1[i] & src2[i]) + +/* --- BOR (bitwise OR) --- */ +PERSISTENT_KERNEL(bor_int8, int8_t, dst[i] = src1[i] | src2[i]) +PERSISTENT_KERNEL(bor_uint8, uint8_t, dst[i] = src1[i] | src2[i]) +PERSISTENT_KERNEL(bor_int16, int16_t, dst[i] = src1[i] | src2[i]) +PERSISTENT_KERNEL(bor_uint16, uint16_t, dst[i] = src1[i] | src2[i]) +PERSISTENT_KERNEL(bor_int32, int32_t, dst[i] = src1[i] | src2[i]) +PERSISTENT_KERNEL(bor_uint32, uint32_t, dst[i] = src1[i] | src2[i]) +PERSISTENT_KERNEL(bor_int64, int64_t, dst[i] = src1[i] | src2[i]) +PERSISTENT_KERNEL(bor_uint64, uint64_t, dst[i] = src1[i] | src2[i]) + +/* --- BXOR (bitwise XOR) --- */ +PERSISTENT_KERNEL(bxor_int8, int8_t, dst[i] = src1[i] ^ src2[i]) +PERSISTENT_KERNEL(bxor_uint8, uint8_t, dst[i] = src1[i] ^ src2[i]) +PERSISTENT_KERNEL(bxor_int16, int16_t, dst[i] = src1[i] ^ src2[i]) +PERSISTENT_KERNEL(bxor_uint16, uint16_t, dst[i] = src1[i] ^ src2[i]) +PERSISTENT_KERNEL(bxor_int32, int32_t, dst[i] = src1[i] ^ src2[i]) +PERSISTENT_KERNEL(bxor_uint32, uint32_t, dst[i] = src1[i] ^ src2[i]) +PERSISTENT_KERNEL(bxor_int64, int64_t, dst[i] = src1[i] ^ src2[i]) +PERSISTENT_KERNEL(bxor_uint64, uint64_t, dst[i] = src1[i] ^ src2[i]) + +/* ========================================================================= + * Host-side launcher wrappers — one per kernel, 1 block × 256 threads. + * ========================================================================= */ +#define LAUNCHER(kname) \ +static void launch_##kname(ompi_op_gpu_cmd_t *cmd, \ + volatile int32_t *sd, \ + cudaStream_t stream) \ +{ \ + ompi_op_cuda_persistent_##kname<<<1, 256, 0, stream>>>(cmd, sd); \ +} + +LAUNCHER(max_int8) LAUNCHER(max_uint8) +LAUNCHER(max_int16) LAUNCHER(max_uint16) +LAUNCHER(max_int32) LAUNCHER(max_uint32) +LAUNCHER(max_int64) LAUNCHER(max_uint64) +LAUNCHER(max_float) LAUNCHER(max_double) + +LAUNCHER(min_int8) LAUNCHER(min_uint8) +LAUNCHER(min_int16) LAUNCHER(min_uint16) +LAUNCHER(min_int32) LAUNCHER(min_uint32) +LAUNCHER(min_int64) LAUNCHER(min_uint64) +LAUNCHER(min_float) LAUNCHER(min_double) + +LAUNCHER(sum_int8) LAUNCHER(sum_uint8) +LAUNCHER(sum_int16) LAUNCHER(sum_uint16) +LAUNCHER(sum_int32) LAUNCHER(sum_uint32) +LAUNCHER(sum_int64) LAUNCHER(sum_uint64) +LAUNCHER(sum_float) LAUNCHER(sum_double) + +LAUNCHER(prod_int8) LAUNCHER(prod_uint8) +LAUNCHER(prod_int16) LAUNCHER(prod_uint16) +LAUNCHER(prod_int32) LAUNCHER(prod_uint32) +LAUNCHER(prod_int64) LAUNCHER(prod_uint64) +LAUNCHER(prod_float) LAUNCHER(prod_double) + +LAUNCHER(band_int8) LAUNCHER(band_uint8) +LAUNCHER(band_int16) LAUNCHER(band_uint16) +LAUNCHER(band_int32) LAUNCHER(band_uint32) +LAUNCHER(band_int64) LAUNCHER(band_uint64) + +LAUNCHER(bor_int8) LAUNCHER(bor_uint8) +LAUNCHER(bor_int16) LAUNCHER(bor_uint16) +LAUNCHER(bor_int32) LAUNCHER(bor_uint32) +LAUNCHER(bor_int64) LAUNCHER(bor_uint64) + +LAUNCHER(bxor_int8) LAUNCHER(bxor_uint8) +LAUNCHER(bxor_int16) LAUNCHER(bxor_uint16) +LAUNCHER(bxor_int32) LAUNCHER(bxor_uint32) +LAUNCHER(bxor_int64) LAUNCHER(bxor_uint64) + +/* ========================================================================= + * 2D launcher table [op_index][type_index] + * + * Indexed by OMPI_OP_BASE_FORTRAN_* (rows) × OMPI_OP_BASE_TYPE_* (columns). + * Zero/NULL entries mean "not supported on GPU" → host fallback. + * + * Zero-initialized here; filled by ompi_op_cuda_kernel_fns_init() called + * from cuda_component_open(). The init function uses plain assignment + * instead of designated initializers to stay compatible with NVCC's C++ + * frontend, which does not support non-trivial designated initializers. + * ========================================================================= */ +ompi_op_cuda_launcher_fn_t +ompi_op_cuda_kernel_fns[OMPI_OP_BASE_FORTRAN_OP_MAX][OMPI_OP_BASE_TYPE_MAX]; + +void +ompi_op_cuda_kernel_fns_init(void) +{ +#define SET(op, type, fn) \ + ompi_op_cuda_kernel_fns[OMPI_OP_BASE_FORTRAN_##op][OMPI_OP_BASE_TYPE_##type] = (fn) + + SET(MAX, INT8_T, launch_max_int8); + SET(MAX, UINT8_T, launch_max_uint8); + SET(MAX, INT16_T, launch_max_int16); + SET(MAX, UINT16_T, launch_max_uint16); + SET(MAX, INT32_T, launch_max_int32); + SET(MAX, UINT32_T, launch_max_uint32); + SET(MAX, INT64_T, launch_max_int64); + SET(MAX, UINT64_T, launch_max_uint64); + SET(MAX, FLOAT, launch_max_float); + SET(MAX, DOUBLE, launch_max_double); + + SET(MIN, INT8_T, launch_min_int8); + SET(MIN, UINT8_T, launch_min_uint8); + SET(MIN, INT16_T, launch_min_int16); + SET(MIN, UINT16_T, launch_min_uint16); + SET(MIN, INT32_T, launch_min_int32); + SET(MIN, UINT32_T, launch_min_uint32); + SET(MIN, INT64_T, launch_min_int64); + SET(MIN, UINT64_T, launch_min_uint64); + SET(MIN, FLOAT, launch_min_float); + SET(MIN, DOUBLE, launch_min_double); + + SET(SUM, INT8_T, launch_sum_int8); + SET(SUM, UINT8_T, launch_sum_uint8); + SET(SUM, INT16_T, launch_sum_int16); + SET(SUM, UINT16_T, launch_sum_uint16); + SET(SUM, INT32_T, launch_sum_int32); + SET(SUM, UINT32_T, launch_sum_uint32); + SET(SUM, INT64_T, launch_sum_int64); + SET(SUM, UINT64_T, launch_sum_uint64); + SET(SUM, FLOAT, launch_sum_float); + SET(SUM, DOUBLE, launch_sum_double); + + SET(PROD, INT8_T, launch_prod_int8); + SET(PROD, UINT8_T, launch_prod_uint8); + SET(PROD, INT16_T, launch_prod_int16); + SET(PROD, UINT16_T, launch_prod_uint16); + SET(PROD, INT32_T, launch_prod_int32); + SET(PROD, UINT32_T, launch_prod_uint32); + SET(PROD, INT64_T, launch_prod_int64); + SET(PROD, UINT64_T, launch_prod_uint64); + SET(PROD, FLOAT, launch_prod_float); + SET(PROD, DOUBLE, launch_prod_double); + + SET(BAND, INT8_T, launch_band_int8); + SET(BAND, UINT8_T, launch_band_uint8); + SET(BAND, INT16_T, launch_band_int16); + SET(BAND, UINT16_T, launch_band_uint16); + SET(BAND, INT32_T, launch_band_int32); + SET(BAND, UINT32_T, launch_band_uint32); + SET(BAND, INT64_T, launch_band_int64); + SET(BAND, UINT64_T, launch_band_uint64); + + SET(BOR, INT8_T, launch_bor_int8); + SET(BOR, UINT8_T, launch_bor_uint8); + SET(BOR, INT16_T, launch_bor_int16); + SET(BOR, UINT16_T, launch_bor_uint16); + SET(BOR, INT32_T, launch_bor_int32); + SET(BOR, UINT32_T, launch_bor_uint32); + SET(BOR, INT64_T, launch_bor_int64); + SET(BOR, UINT64_T, launch_bor_uint64); + + SET(BXOR, INT8_T, launch_bxor_int8); + SET(BXOR, UINT8_T, launch_bxor_uint8); + SET(BXOR, INT16_T, launch_bxor_int16); + SET(BXOR, UINT16_T, launch_bxor_uint16); + SET(BXOR, INT32_T, launch_bxor_int32); + SET(BXOR, UINT32_T, launch_bxor_uint32); + SET(BXOR, INT64_T, launch_bxor_int64); + SET(BXOR, UINT64_T, launch_bxor_uint64); + + /* LAND, LOR, LXOR, MAXLOC, MINLOC, REPLACE, NO_OP: NULL → host path */ +#undef SET +} diff --git a/ompi/mca/op/cuda/op_cuda_session.c b/ompi/mca/op/cuda/op_cuda_session.c new file mode 100644 index 00000000000..c51c47307a8 --- /dev/null +++ b/ompi/mca/op/cuda/op_cuda_session.c @@ -0,0 +1,244 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2025 Amazon.com, Inc. or its affiliates. All rights + * reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +/* + * Session lifecycle for the CUDA persistent-kernel op component. + * + * ompi_op_cuda_cmd_queue_t inherits ompi_op_gpu_cmd_queue_t. OBJ_NEW + * allocates the object; the OBJ destructor releases the CUDA stream and + * managed memory. The component returns the base pointer from alloc so + * callers need no knowledge of the concrete type. + * + * session_begin: look up the kernel for (op, dtype), reset the cmd_queue + * state, and launch the persistent kernel on the existing + * stream. Wires all session dispatch hooks and returns the + * session. Returns NULL if no kernel exists. + * + * session_reduce: write src/dst/count to the command slot, set status=1 + * to wake the kernel, and spin until status==2. + * + * session_stop: signal the persistent kernel to exit and synchronize the + * stream. The cmd_queue's GPU stream and managed memory + * remain allocated for reuse. + */ + +#include "ompi_config.h" +#include +#include +#include + +#include + +#include "opal/mca/accelerator/base/base.h" +#include "ompi/op/op.h" +#include "ompi/datatype/ompi_datatype.h" +#include "ompi/op/op_gpu_session.h" +#include "ompi/mca/op/op.h" +#include "ompi/mca/op/cuda/op_cuda.h" + +/* ompi_op_ddt_map[] maps dtype->id → OMPI_OP_BASE_TYPE_* (-1 if none) */ +extern int ompi_op_ddt_map[OMPI_DATATYPE_MAX_PREDEFINED]; + +/* Forward declarations of static session hooks referenced from session_begin. */ +static void ompi_op_cuda_session_reduce(ompi_op_gpu_session_t *session, + const void *src1, const void *src2, + void *dst, size_t count); +static void ompi_op_cuda_session_stop(ompi_op_gpu_session_t *session); + +/* -------------------------------------------------------------------------- + * OBJ constructor / destructor for ompi_op_cuda_cmd_queue_t + * -------------------------------------------------------------------------- */ +static void +ompi_op_cuda_cmd_queue_construct(ompi_op_cuda_cmd_queue_t *q) +{ + q->shutdown = NULL; + q->stream = NULL; + q->super.cmd = NULL; + q->super.dev_id = -1; + q->super.allocator = NULL; + q->super.session_begin_fn = NULL; +} + +static void +ompi_op_cuda_cmd_queue_destruct(ompi_op_cuda_cmd_queue_t *q) +{ + if (NULL != q->stream) { + cudaStreamDestroy(q->stream); + q->stream = NULL; + } + if (NULL != q->shutdown) { + cudaFree((void *) q->shutdown); + q->shutdown = NULL; + } + if (NULL != q->super.cmd) { + cudaFree(q->super.cmd); + q->super.cmd = NULL; + } +} + +OBJ_CLASS_INSTANCE(ompi_op_cuda_cmd_queue_t, + ompi_op_gpu_cmd_queue_t, + ompi_op_cuda_cmd_queue_construct, + ompi_op_cuda_cmd_queue_destruct); + +/* -------------------------------------------------------------------------- + * ompi_op_cuda_cmd_queue_alloc + * + * Allocate the expensive GPU resources for one device: a managed-memory + * command slot, a managed-memory shutdown flag, and a private CUDA stream. + * Returns the base pointer (ompi_op_gpu_cmd_queue_t *); NULL on failure. + * -------------------------------------------------------------------------- */ +ompi_op_gpu_cmd_queue_t * +ompi_op_cuda_cmd_queue_alloc(int dev_id) +{ + ompi_op_cuda_cmd_queue_t *q = OBJ_NEW(ompi_op_cuda_cmd_queue_t); + if (NULL == q) { + return NULL; + } + + cudaError_t err; + + err = cudaMallocManaged((void **) &q->super.cmd, + sizeof(ompi_op_gpu_cmd_t), + cudaMemAttachGlobal); + if (cudaSuccess != err) { + OBJ_RELEASE(q); + return NULL; + } + q->super.cmd->src1 = NULL; + q->super.cmd->src2 = NULL; + q->super.cmd->dst = NULL; + q->super.cmd->count = 0; + q->super.cmd->status = 0; + + err = cudaMallocManaged((void **) &q->shutdown, + sizeof(int32_t), + cudaMemAttachGlobal); + if (cudaSuccess != err) { + OBJ_RELEASE(q); + return NULL; + } + *q->shutdown = 0; + + err = cudaStreamCreateWithFlags(&q->stream, cudaStreamNonBlocking); + if (cudaSuccess != err) { + OBJ_RELEASE(q); + return NULL; + } + + q->super.dev_id = dev_id; + q->super.allocator = opal_accelerator_base_get_device_allocator(dev_id); + return &q->super; +} + +/* -------------------------------------------------------------------------- + * ompi_op_cuda_session_begin + * + * Look up the GPU kernel for (op, dtype), reset the cmd_queue state, and + * launch the persistent kernel on the existing stream. Wires all session + * dispatch hooks before returning. Returns NULL if no GPU kernel exists + * for this combination or if the kernel launch fails. + * -------------------------------------------------------------------------- */ +ompi_op_gpu_session_t * +ompi_op_cuda_session_begin(ompi_op_gpu_cmd_queue_t *queue, + struct ompi_op_t *op, + struct ompi_datatype_t *dtype) +{ + int op_idx = op->o_f_to_c_index; + int type_idx = (dtype->id < OMPI_DATATYPE_MAX_PREDEFINED) + ? ompi_op_ddt_map[dtype->id] : -1; + + if (op_idx < 0 || op_idx >= OMPI_OP_BASE_FORTRAN_OP_MAX || + type_idx < 0 || type_idx >= OMPI_OP_BASE_TYPE_MAX) { + return NULL; + } + + ompi_op_cuda_launcher_fn_t launcher = ompi_op_cuda_kernel_fns[op_idx][type_idx]; + if (NULL == launcher) { + return NULL; + } + + ompi_op_cuda_cmd_queue_t *cq = (ompi_op_cuda_cmd_queue_t *) queue; + + /* Reset queue state for the new kernel */ + *cq->shutdown = 0; + queue->cmd->src1 = NULL; + queue->cmd->src2 = NULL; + queue->cmd->dst = NULL; + queue->cmd->count = 0; + queue->cmd->status = 0; + + /* Launch the persistent kernel */ + launcher(queue->cmd, cq->shutdown, cq->stream); + cudaError_t err = cudaGetLastError(); + if (cudaSuccess != err) { + return NULL; + } + + ompi_op_gpu_session_t *session = + (ompi_op_gpu_session_t *) malloc(sizeof(ompi_op_gpu_session_t)); + if (NULL == session) { + return NULL; + } + + session->queue = queue; + session->allocator = queue->allocator; + session->reduce_fn = ompi_op_cuda_session_reduce; + session->stop_fn = ompi_op_cuda_session_stop; + return session; +} + +/* -------------------------------------------------------------------------- + * ompi_op_cuda_session_reduce + * -------------------------------------------------------------------------- */ +static void +ompi_op_cuda_session_reduce(ompi_op_gpu_session_t *session, + const void *src1, const void *src2, + void *dst, size_t count) +{ + ompi_op_gpu_cmd_t *cmd = session->queue->cmd; + + /* Write operands before signalling the kernel */ + cmd->src1 = src1; + cmd->src2 = src2; + cmd->dst = dst; + cmd->count = (int64_t) count; + + __atomic_thread_fence(__ATOMIC_SEQ_CST); /* ensure writes visible to GPU */ + cmd->status = 1; /* wake the kernel */ + + /* Spin-wait for the kernel to signal completion */ + while (2 != cmd->status) { + sched_yield(); /* relinquish CPU timeslice while waiting */ + } + + /* Reset for the next call */ + cmd->status = 0; +} + +/* -------------------------------------------------------------------------- + * ompi_op_cuda_session_stop + * + * Signal the persistent kernel to exit and wait for the stream to drain. + * The cmd_queue's stream and managed memory remain allocated for reuse. + * -------------------------------------------------------------------------- */ +static void +ompi_op_cuda_session_stop(ompi_op_gpu_session_t *session) +{ + ompi_op_cuda_cmd_queue_t *cq = (ompi_op_cuda_cmd_queue_t *) session->queue; + + /* Signal the kernel to exit its loop */ + *cq->shutdown = 1; + __atomic_thread_fence(__ATOMIC_SEQ_CST); + + /* Wait for the kernel to finish; stream remains valid after this */ + cudaStreamSynchronize(cq->stream); +} diff --git a/ompi/mca/op/op.h b/ompi/mca/op/op.h index 34d26376ab9..ffb21da034a 100644 --- a/ompi/mca/op/op.h +++ b/ompi/mca/op/op.h @@ -326,6 +326,30 @@ typedef struct ompi_op_base_module_1_0_0_t * (*ompi_op_base_component_op_query_1_0_0_fn_t) (struct ompi_op_t *op, int *priority); +/* Forward declarations for GPU types (defined in ompi/op/op_gpu_session.h) */ +struct ompi_op_gpu_cmd_queue_t; +struct ompi_op_gpu_session_t; + +/** + * Optional component hook: allocate the expensive GPU resources for a + * cmd_queue on the given device: managed-memory command slot, shutdown flag, + * and a private GPU stream. Returns NULL on allocation failure. + * The caller (op_gpu_session.c) wires session_begin_fn and free_fn. + */ +typedef struct ompi_op_gpu_cmd_queue_t * + (*ompi_op_base_component_cmd_queue_alloc_fn_t)(int dev_id); + +/** + * Optional component hook: look up the GPU kernel for (op, dtype), reset the + * cmd_queue state, and launch the persistent kernel on the queue's stream. + * Returns a fully-wired ompi_op_gpu_session_t on success, NULL if no GPU + * kernel exists for this (op, dtype) combination. + */ +typedef struct ompi_op_gpu_session_t * + (*ompi_op_base_component_session_begin_fn_t)(struct ompi_op_gpu_cmd_queue_t *queue, + struct ompi_op_t *op, + struct ompi_datatype_t *dtype); + /** * Op component interface. * @@ -343,6 +367,10 @@ typedef struct ompi_op_base_component_1_0_0_t { ompi_op_base_component_init_query_fn_t opc_init_query; /** Query whether component is usable for given op */ ompi_op_base_component_op_query_1_0_0_fn_t opc_op_query; + + /** Optional: GPU cmd_queue and session hooks. NULL in host-only components. */ + ompi_op_base_component_cmd_queue_alloc_fn_t opc_cmd_queue_alloc; + ompi_op_base_component_session_begin_fn_t opc_session_begin; } ompi_op_base_component_1_0_0_t; diff --git a/ompi/mca/op/rocm/Makefile.am b/ompi/mca/op/rocm/Makefile.am new file mode 100644 index 00000000000..a3bdfe08c8a --- /dev/null +++ b/ompi/mca/op/rocm/Makefile.am @@ -0,0 +1,73 @@ +# +# Copyright (c) 2025 Amazon.com, Inc. or its affiliates. All rights +# reserved. +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + +# C sources (compiled by the normal C compiler) +sources = \ + op_rocm.h \ + op_rocm_component.c \ + op_rocm_session.c + +# The .cpp file is compiled separately by hipcc and linked in as a plain +# object. Libtool does not know how to compile HIP C++, so we use a custom +# rule. The resulting object is appended to LIBADD for both DSO and static +# builds. + +rocm_sources = op_rocm_kernels.cpp +CLEANFILES = op_rocm_kernels.o + +# Include paths forwarded to hipcc so it can find ompi_config.h and the +# op/mca headers. +HIPCC_INCLUDES = \ + -I$(top_srcdir) \ + -I$(top_builddir) \ + -I$(top_srcdir)/ompi \ + -I$(top_builddir)/ompi \ + -I$(top_builddir)/opal \ + -I$(top_builddir)/opal/include \ + -I$(top_srcdir)/ompi/include \ + -I$(top_srcdir)/opal/include + +op_rocm_kernels.l$(OBJEXT): $(srcdir)/op_rocm_kernels.cpp \ + $(srcdir)/op_rocm.h + $(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile \ + $(HIPCC) $(HIPCCFLAGS) $(HIPCC_INCLUDES) \ + $(op_rocm_CPPFLAGS) \ + -c $< + +AM_CPPFLAGS = $(op_rocm_CPPFLAGS) + +# ---------------------------------------------------------------------------- +# DSO build +# ---------------------------------------------------------------------------- +if MCA_BUILD_ompi_op_rocm_DSO +component_install = mca_op_rocm.la +component_noinst = +else +component_install = +component_noinst = libmca_op_rocm.la +endif + +mcacomponentdir = $(ompilibdir) +mcacomponent_LTLIBRARIES = $(component_install) + +mca_op_rocm_la_SOURCES = $(sources) +mca_op_rocm_la_LDFLAGS = -module -avoid-version $(op_rocm_LDFLAGS) +mca_op_rocm_la_LIBADD = $(rocm_sources:.cpp=.lo) +mca_op_rocm_la_CPPFLAGS = $(op_rocm_CPPFLAGS) + +# ---------------------------------------------------------------------------- +# Static (convenience library) build +# ---------------------------------------------------------------------------- +noinst_LTLIBRARIES = $(component_noinst) + +libmca_op_rocm_la_SOURCES = $(sources) +libmca_op_rocm_la_LDFLAGS = -module -avoid-version $(op_rocm_LDFLAGS) +libmca_op_rocm_la_LIBADD = $(op_rocm_LIBS) $(rocm_sources:.cpp=.lo) +libmca_op_rocm_la_CPPFLAGS = $(op_rocm_CPPFLAGS) diff --git a/ompi/mca/op/rocm/configure.m4 b/ompi/mca/op/rocm/configure.m4 new file mode 100644 index 00000000000..6625aa06f7a --- /dev/null +++ b/ompi/mca/op/rocm/configure.m4 @@ -0,0 +1,70 @@ +# -*- shell-script -*- +# +# Copyright (c) 2025 Amazon.com, Inc. or its affiliates. All rights +# reserved. +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + +# MCA_ompi_op_rocm_CONFIG([action-if-can-compile], +# [action-if-cant-compile]) +# ------------------------------------------------ +# Build the ROCm persistent-kernel op component only when the HIP runtime +# (libamdhip64 + hip/hip_runtime.h) and hipcc are available. +# +# Calls OPAL_CHECK_ROCM to locate headers and libraries, then separately +# finds hipcc. Sets: +# op_rocm_CPPFLAGS — include/define flags for HIP (includes -D__HIP_PLATFORM_AMD__) +# op_rocm_LDFLAGS — library search path for libamdhip64 +# op_rocm_LIBS — -lamdhip64 +# HIPCC — path to the hipcc compiler +# HIPCCFLAGS — default hipcc flags +# +AC_DEFUN([MCA_ompi_op_rocm_CONFIG],[ + AC_CONFIG_FILES([ompi/mca/op/rocm/Makefile]) + + OPAL_VAR_SCOPE_PUSH([op_rocm_happy op_rocm_hipcc_path]) + + op_rocm_happy=no + + # OPAL_CHECK_ROCM calls OAC_CHECK_PACKAGE and sets: + # op_rocm_CPPFLAGS, op_rocm_LDFLAGS, op_rocm_LIBS + # It also sets ROCM_SUPPORT=1 on success. + OPAL_CHECK_ROCM([op_rocm], + [op_rocm_happy=yes], + [op_rocm_happy=no]) + + # Find hipcc alongside the ROCm installation. + AS_IF([test "$op_rocm_happy" = "yes"], + [op_rocm_hipcc_path="$PATH" + AS_IF([test -n "$with_rocm" && test "$with_rocm" != "no" && test -d "$with_rocm/bin"], + [op_rocm_hipcc_path="$with_rocm/bin:$PATH"], + [AS_IF([test -d "/opt/rocm/bin"], + [op_rocm_hipcc_path="/opt/rocm/bin:$PATH"])]) + AC_PATH_PROG([HIPCC], [hipcc], [not_found], [$op_rocm_hipcc_path]) + AS_IF([test "$HIPCC" = "not_found"], + [AC_MSG_WARN([hipcc not found; skipping op/rocm component]) + op_rocm_happy=no]) + ]) + + OPAL_SUMMARY_ADD([Accelerators], [ROCm operator support], [], [$op_rocm_happy]) + # Default HIPCCFLAGS if not already set by the user. + AS_IF([test "$op_rocm_happy" = "yes" && test "x$HIPCCFLAGS" = "x"], + [HIPCCFLAGS="--offload-arch=gfx906"]) + + AC_SUBST([op_rocm_CPPFLAGS]) + AC_SUBST([op_rocm_LDFLAGS]) + AC_SUBST([op_rocm_LIBS]) + AC_SUBST([HIPCC]) + AC_SUBST([HIPCCFLAGS]) + + AS_IF([test "$op_rocm_happy" = "yes"], + [$1], + [$2]) + + OPAL_VAR_SCOPE_POP + +])dnl diff --git a/ompi/mca/op/rocm/op_rocm.h b/ompi/mca/op/rocm/op_rocm.h new file mode 100644 index 00000000000..df80d4f915e --- /dev/null +++ b/ompi/mca/op/rocm/op_rocm.h @@ -0,0 +1,66 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2025 Amazon.com, Inc. or its affiliates. All rights + * reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#ifndef OMPI_MCA_OP_ROCM_H +#define OMPI_MCA_OP_ROCM_H + +#include "ompi_config.h" +#include + +#include "ompi/mca/op/op.h" +#include "ompi/op/op_gpu_session.h" /* defines ompi_op_gpu_cmd_t */ + +BEGIN_C_DECLS + +/** + * ROCm-specific cmd_queue. Inherits ompi_op_gpu_cmd_queue_t by placing it + * as the first member named "super". The HIP stream and shutdown flag are + * stored directly here rather than in a separate priv allocation. + * Allocated with OBJ_NEW; the OBJ destructor chain releases GPU resources. + */ +typedef struct ompi_op_rocm_cmd_queue_t { + ompi_op_gpu_cmd_queue_t super; /* MUST be first */ + volatile int32_t *shutdown; /* managed-memory shutdown flag */ + hipStream_t stream; /* private HIP stream */ +} ompi_op_rocm_cmd_queue_t; +OBJ_CLASS_DECLARATION(ompi_op_rocm_cmd_queue_t); + +/** + * Host-side launcher function type. + * Launches the persistent kernel for one (op, type) combination. + */ +typedef void (*ompi_op_rocm_launcher_fn_t)(ompi_op_gpu_cmd_t *cmd, + volatile int32_t *shutdown, + hipStream_t stream); + +/** + * 2D table [op_index][type_index] of launcher functions. + * NULL entries indicate unsupported (op, type) combinations; the session + * machinery returns NULL for those and the caller falls back to the host path. + * + * Indexed by OMPI_OP_BASE_FORTRAN_* × OMPI_OP_BASE_TYPE_*. + * Defined (and initialized) in op_rocm_kernels.cpp. + */ +OMPI_DECLSPEC extern ompi_op_rocm_launcher_fn_t +ompi_op_rocm_kernel_fns[OMPI_OP_BASE_FORTRAN_OP_MAX][OMPI_OP_BASE_TYPE_MAX]; + +/* Defined in op_rocm_kernels.cpp (extern "C") */ +void ompi_op_rocm_kernel_fns_init(void); + +/* Defined in op_rocm_session.c */ +ompi_op_gpu_cmd_queue_t *ompi_op_rocm_cmd_queue_alloc(int dev_id); +ompi_op_gpu_session_t *ompi_op_rocm_session_begin(ompi_op_gpu_cmd_queue_t *queue, + struct ompi_op_t *op, + struct ompi_datatype_t *dtype); + +END_C_DECLS + +#endif /* OMPI_MCA_OP_ROCM_H */ diff --git a/ompi/mca/op/rocm/op_rocm_component.c b/ompi/mca/op/rocm/op_rocm_component.c new file mode 100644 index 00000000000..7966a79f1e2 --- /dev/null +++ b/ompi/mca/op/rocm/op_rocm_component.c @@ -0,0 +1,96 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2025 Amazon.com, Inc. or its affiliates. All rights + * reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "ompi_config.h" + +#include + +#include "ompi/constants.h" +#include "ompi/op/op.h" +#include "ompi/mca/op/op.h" +#include "ompi/mca/op/base/base.h" +#include "ompi/op/op_gpu_session.h" +#include "ompi/mca/op/rocm/op_rocm.h" + + +static int rocm_component_open(void); +static int rocm_component_close(void); +static int rocm_component_init_query(bool enable_progress_threads, + bool enable_mpi_thread_multiple); +static struct ompi_op_base_module_1_0_0_t * + rocm_component_op_query(struct ompi_op_t *op, int *priority); + +/* + * Public component descriptor. + */ +ompi_op_base_component_1_0_0_t mca_op_rocm_component = { + .opc_version = { + OMPI_OP_BASE_VERSION_1_0_0, + + .mca_component_name = "rocm", + MCA_BASE_MAKE_VERSION(component, OMPI_MAJOR_VERSION, OMPI_MINOR_VERSION, + OMPI_RELEASE_VERSION), + .mca_open_component = rocm_component_open, + .mca_close_component = rocm_component_close, + }, + .opc_data = { + MCA_BASE_METADATA_PARAM_CHECKPOINT + }, + + .opc_init_query = rocm_component_init_query, + .opc_op_query = rocm_component_op_query, + + /* GPU session hooks */ + .opc_cmd_queue_alloc = ompi_op_rocm_cmd_queue_alloc, + .opc_session_begin = ompi_op_rocm_session_begin, +}; +MCA_BASE_COMPONENT_INIT(ompi, op, rocm) + +static int +rocm_component_open(void) +{ + ompi_op_rocm_kernel_fns_init(); + return OMPI_SUCCESS; +} + +static int +rocm_component_close(void) +{ + return OMPI_SUCCESS; +} + +/* + * Only activate this component when at least one ROCm-capable device is + * present in the system. + */ +static int +rocm_component_init_query(bool enable_progress_threads, + bool enable_mpi_thread_multiple) +{ + int device_count = 0; + hipError_t err = hipGetDeviceCount(&device_count); + if (hipSuccess != err || device_count <= 0) { + return OMPI_ERR_NOT_SUPPORTED; + } + return OMPI_SUCCESS; +} + +/* + * We do not provide per-op function pointers, only session hooks, so + * always return NULL here. + */ +static struct ompi_op_base_module_1_0_0_t * +rocm_component_op_query(struct ompi_op_t *op, int *priority) +{ + (void) op; + (void) priority; + return NULL; +} diff --git a/ompi/mca/op/rocm/op_rocm_kernels.cpp b/ompi/mca/op/rocm/op_rocm_kernels.cpp new file mode 100644 index 00000000000..d492957c002 --- /dev/null +++ b/ompi/mca/op/rocm/op_rocm_kernels.cpp @@ -0,0 +1,285 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2025 Amazon.com, Inc. or its affiliates. All rights + * reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +/* + * Persistent reduction kernels for the ROCm op component. + * + * Structure mirrors op_cuda_kernels.cu; differences from the CUDA version: + * - #include instead of + * - __builtin_amdgcn_s_sleep(1) replaces __nanosleep(1000) for the + * low-power spin-wait (GCN instruction; ~64 clock sleep) + * - All cuda* API calls replaced by their hip* equivalents + * - Launcher macro uses ompi_op_rocm_persistent_* prefix + */ + +#include +#include + +#include "ompi/mca/op/op.h" +#include "ompi/mca/op/rocm/op_rocm.h" + +/* ------------------------------------------------------------------------- + * PERSISTENT_KERNEL(name, ctype, op_expr) + * + * Generates __global__ void ompi_op_rocm_persistent_(...). + * op_expr must be a statement writing dst[i] from src1[i] and src2[i], + * e.g. "dst[i] = src1[i] + src2[i]". src2 may alias dst for in-place ops. + * ------------------------------------------------------------------------- */ +#define PERSISTENT_KERNEL(kname, ctype, op_expr) \ +__global__ void ompi_op_rocm_persistent_##kname( \ + ompi_op_gpu_cmd_t *cmd, volatile int32_t *shutdown) \ +{ \ + while (!*shutdown) { \ + /* Spin-wait for work; sleep ~64 clocks between polls to save power */ \ + while (cmd->status != 1 && !*shutdown) { \ + __builtin_amdgcn_s_sleep(1); \ + } \ + if (*shutdown) break; \ + const ctype * __restrict__ src1 = (const ctype *) cmd->src1; \ + const ctype * __restrict__ src2 = (const ctype *) cmd->src2; \ + ctype * __restrict__ dst = ( ctype *) cmd->dst; \ + int64_t n = cmd->count; \ + for (int64_t i = (int64_t)threadIdx.x; i < n; i += blockDim.x) { \ + op_expr; \ + } \ + __syncthreads(); \ + if (threadIdx.x == 0) { \ + __threadfence_system(); /* ensure dst writes reach host */ \ + cmd->status = 2; /* signal done */ \ + } \ + } \ +} + +/* ========================================================================= + * Kernel instantiations + * ========================================================================= */ + +/* --- MAX --- */ +PERSISTENT_KERNEL(max_int8, int8_t, dst[i] = src1[i] > src2[i] ? src1[i] : src2[i]) +PERSISTENT_KERNEL(max_uint8, uint8_t, dst[i] = src1[i] > src2[i] ? src1[i] : src2[i]) +PERSISTENT_KERNEL(max_int16, int16_t, dst[i] = src1[i] > src2[i] ? src1[i] : src2[i]) +PERSISTENT_KERNEL(max_uint16, uint16_t, dst[i] = src1[i] > src2[i] ? src1[i] : src2[i]) +PERSISTENT_KERNEL(max_int32, int32_t, dst[i] = src1[i] > src2[i] ? src1[i] : src2[i]) +PERSISTENT_KERNEL(max_uint32, uint32_t, dst[i] = src1[i] > src2[i] ? src1[i] : src2[i]) +PERSISTENT_KERNEL(max_int64, int64_t, dst[i] = src1[i] > src2[i] ? src1[i] : src2[i]) +PERSISTENT_KERNEL(max_uint64, uint64_t, dst[i] = src1[i] > src2[i] ? src1[i] : src2[i]) +PERSISTENT_KERNEL(max_float, float, dst[i] = src1[i] > src2[i] ? src1[i] : src2[i]) +PERSISTENT_KERNEL(max_double, double, dst[i] = src1[i] > src2[i] ? src1[i] : src2[i]) + +/* --- MIN --- */ +PERSISTENT_KERNEL(min_int8, int8_t, dst[i] = src1[i] < src2[i] ? src1[i] : src2[i]) +PERSISTENT_KERNEL(min_uint8, uint8_t, dst[i] = src1[i] < src2[i] ? src1[i] : src2[i]) +PERSISTENT_KERNEL(min_int16, int16_t, dst[i] = src1[i] < src2[i] ? src1[i] : src2[i]) +PERSISTENT_KERNEL(min_uint16, uint16_t, dst[i] = src1[i] < src2[i] ? src1[i] : src2[i]) +PERSISTENT_KERNEL(min_int32, int32_t, dst[i] = src1[i] < src2[i] ? src1[i] : src2[i]) +PERSISTENT_KERNEL(min_uint32, uint32_t, dst[i] = src1[i] < src2[i] ? src1[i] : src2[i]) +PERSISTENT_KERNEL(min_int64, int64_t, dst[i] = src1[i] < src2[i] ? src1[i] : src2[i]) +PERSISTENT_KERNEL(min_uint64, uint64_t, dst[i] = src1[i] < src2[i] ? src1[i] : src2[i]) +PERSISTENT_KERNEL(min_float, float, dst[i] = src1[i] < src2[i] ? src1[i] : src2[i]) +PERSISTENT_KERNEL(min_double, double, dst[i] = src1[i] < src2[i] ? src1[i] : src2[i]) + +/* --- SUM --- */ +PERSISTENT_KERNEL(sum_int8, int8_t, dst[i] = src1[i] + src2[i]) +PERSISTENT_KERNEL(sum_uint8, uint8_t, dst[i] = src1[i] + src2[i]) +PERSISTENT_KERNEL(sum_int16, int16_t, dst[i] = src1[i] + src2[i]) +PERSISTENT_KERNEL(sum_uint16, uint16_t, dst[i] = src1[i] + src2[i]) +PERSISTENT_KERNEL(sum_int32, int32_t, dst[i] = src1[i] + src2[i]) +PERSISTENT_KERNEL(sum_uint32, uint32_t, dst[i] = src1[i] + src2[i]) +PERSISTENT_KERNEL(sum_int64, int64_t, dst[i] = src1[i] + src2[i]) +PERSISTENT_KERNEL(sum_uint64, uint64_t, dst[i] = src1[i] + src2[i]) +PERSISTENT_KERNEL(sum_float, float, dst[i] = src1[i] + src2[i]) +PERSISTENT_KERNEL(sum_double, double, dst[i] = src1[i] + src2[i]) + +/* --- PROD --- */ +PERSISTENT_KERNEL(prod_int8, int8_t, dst[i] = src1[i] * src2[i]) +PERSISTENT_KERNEL(prod_uint8, uint8_t, dst[i] = src1[i] * src2[i]) +PERSISTENT_KERNEL(prod_int16, int16_t, dst[i] = src1[i] * src2[i]) +PERSISTENT_KERNEL(prod_uint16, uint16_t, dst[i] = src1[i] * src2[i]) +PERSISTENT_KERNEL(prod_int32, int32_t, dst[i] = src1[i] * src2[i]) +PERSISTENT_KERNEL(prod_uint32, uint32_t, dst[i] = src1[i] * src2[i]) +PERSISTENT_KERNEL(prod_int64, int64_t, dst[i] = src1[i] * src2[i]) +PERSISTENT_KERNEL(prod_uint64, uint64_t, dst[i] = src1[i] * src2[i]) +PERSISTENT_KERNEL(prod_float, float, dst[i] = src1[i] * src2[i]) +PERSISTENT_KERNEL(prod_double, double, dst[i] = src1[i] * src2[i]) + +/* --- BAND (bitwise AND, integer types only) --- */ +PERSISTENT_KERNEL(band_int8, int8_t, dst[i] = src1[i] & src2[i]) +PERSISTENT_KERNEL(band_uint8, uint8_t, dst[i] = src1[i] & src2[i]) +PERSISTENT_KERNEL(band_int16, int16_t, dst[i] = src1[i] & src2[i]) +PERSISTENT_KERNEL(band_uint16, uint16_t, dst[i] = src1[i] & src2[i]) +PERSISTENT_KERNEL(band_int32, int32_t, dst[i] = src1[i] & src2[i]) +PERSISTENT_KERNEL(band_uint32, uint32_t, dst[i] = src1[i] & src2[i]) +PERSISTENT_KERNEL(band_int64, int64_t, dst[i] = src1[i] & src2[i]) +PERSISTENT_KERNEL(band_uint64, uint64_t, dst[i] = src1[i] & src2[i]) + +/* --- BOR (bitwise OR) --- */ +PERSISTENT_KERNEL(bor_int8, int8_t, dst[i] = src1[i] | src2[i]) +PERSISTENT_KERNEL(bor_uint8, uint8_t, dst[i] = src1[i] | src2[i]) +PERSISTENT_KERNEL(bor_int16, int16_t, dst[i] = src1[i] | src2[i]) +PERSISTENT_KERNEL(bor_uint16, uint16_t, dst[i] = src1[i] | src2[i]) +PERSISTENT_KERNEL(bor_int32, int32_t, dst[i] = src1[i] | src2[i]) +PERSISTENT_KERNEL(bor_uint32, uint32_t, dst[i] = src1[i] | src2[i]) +PERSISTENT_KERNEL(bor_int64, int64_t, dst[i] = src1[i] | src2[i]) +PERSISTENT_KERNEL(bor_uint64, uint64_t, dst[i] = src1[i] | src2[i]) + +/* --- BXOR (bitwise XOR) --- */ +PERSISTENT_KERNEL(bxor_int8, int8_t, dst[i] = src1[i] ^ src2[i]) +PERSISTENT_KERNEL(bxor_uint8, uint8_t, dst[i] = src1[i] ^ src2[i]) +PERSISTENT_KERNEL(bxor_int16, int16_t, dst[i] = src1[i] ^ src2[i]) +PERSISTENT_KERNEL(bxor_uint16, uint16_t, dst[i] = src1[i] ^ src2[i]) +PERSISTENT_KERNEL(bxor_int32, int32_t, dst[i] = src1[i] ^ src2[i]) +PERSISTENT_KERNEL(bxor_uint32, uint32_t, dst[i] = src1[i] ^ src2[i]) +PERSISTENT_KERNEL(bxor_int64, int64_t, dst[i] = src1[i] ^ src2[i]) +PERSISTENT_KERNEL(bxor_uint64, uint64_t, dst[i] = src1[i] ^ src2[i]) + +/* ========================================================================= + * Host-side launcher wrappers — one per kernel, 1 block × 256 threads. + * ========================================================================= */ +#define LAUNCHER(kname) \ +static void launch_##kname(ompi_op_gpu_cmd_t *cmd, \ + volatile int32_t *sd, \ + hipStream_t stream) \ +{ \ + ompi_op_rocm_persistent_##kname<<<1, 256, 0, stream>>>(cmd, sd); \ +} + +LAUNCHER(max_int8) LAUNCHER(max_uint8) +LAUNCHER(max_int16) LAUNCHER(max_uint16) +LAUNCHER(max_int32) LAUNCHER(max_uint32) +LAUNCHER(max_int64) LAUNCHER(max_uint64) +LAUNCHER(max_float) LAUNCHER(max_double) + +LAUNCHER(min_int8) LAUNCHER(min_uint8) +LAUNCHER(min_int16) LAUNCHER(min_uint16) +LAUNCHER(min_int32) LAUNCHER(min_uint32) +LAUNCHER(min_int64) LAUNCHER(min_uint64) +LAUNCHER(min_float) LAUNCHER(min_double) + +LAUNCHER(sum_int8) LAUNCHER(sum_uint8) +LAUNCHER(sum_int16) LAUNCHER(sum_uint16) +LAUNCHER(sum_int32) LAUNCHER(sum_uint32) +LAUNCHER(sum_int64) LAUNCHER(sum_uint64) +LAUNCHER(sum_float) LAUNCHER(sum_double) + +LAUNCHER(prod_int8) LAUNCHER(prod_uint8) +LAUNCHER(prod_int16) LAUNCHER(prod_uint16) +LAUNCHER(prod_int32) LAUNCHER(prod_uint32) +LAUNCHER(prod_int64) LAUNCHER(prod_uint64) +LAUNCHER(prod_float) LAUNCHER(prod_double) + +LAUNCHER(band_int8) LAUNCHER(band_uint8) +LAUNCHER(band_int16) LAUNCHER(band_uint16) +LAUNCHER(band_int32) LAUNCHER(band_uint32) +LAUNCHER(band_int64) LAUNCHER(band_uint64) + +LAUNCHER(bor_int8) LAUNCHER(bor_uint8) +LAUNCHER(bor_int16) LAUNCHER(bor_uint16) +LAUNCHER(bor_int32) LAUNCHER(bor_uint32) +LAUNCHER(bor_int64) LAUNCHER(bor_uint64) + +LAUNCHER(bxor_int8) LAUNCHER(bxor_uint8) +LAUNCHER(bxor_int16) LAUNCHER(bxor_uint16) +LAUNCHER(bxor_int32) LAUNCHER(bxor_uint32) +LAUNCHER(bxor_int64) LAUNCHER(bxor_uint64) + +/* ========================================================================= + * 2D launcher table [op_index][type_index] + * + * Indexed by OMPI_OP_BASE_FORTRAN_* (rows) × OMPI_OP_BASE_TYPE_* (columns). + * Zero/NULL entries mean "not supported on GPU" → host fallback. + * + * Zero-initialized here; filled by ompi_op_rocm_kernel_fns_init() called + * from rocm_component_open(). Plain assignment avoids non-trivial designated + * initializers which are not supported by all GPU compiler C++ frontends. + * ========================================================================= */ +ompi_op_rocm_launcher_fn_t +ompi_op_rocm_kernel_fns[OMPI_OP_BASE_FORTRAN_OP_MAX][OMPI_OP_BASE_TYPE_MAX]; + +void +ompi_op_rocm_kernel_fns_init(void) +{ +#define SET(op, type, fn) \ + ompi_op_rocm_kernel_fns[OMPI_OP_BASE_FORTRAN_##op][OMPI_OP_BASE_TYPE_##type] = (fn) + + SET(MAX, INT8_T, launch_max_int8); + SET(MAX, UINT8_T, launch_max_uint8); + SET(MAX, INT16_T, launch_max_int16); + SET(MAX, UINT16_T, launch_max_uint16); + SET(MAX, INT32_T, launch_max_int32); + SET(MAX, UINT32_T, launch_max_uint32); + SET(MAX, INT64_T, launch_max_int64); + SET(MAX, UINT64_T, launch_max_uint64); + SET(MAX, FLOAT, launch_max_float); + SET(MAX, DOUBLE, launch_max_double); + + SET(MIN, INT8_T, launch_min_int8); + SET(MIN, UINT8_T, launch_min_uint8); + SET(MIN, INT16_T, launch_min_int16); + SET(MIN, UINT16_T, launch_min_uint16); + SET(MIN, INT32_T, launch_min_int32); + SET(MIN, UINT32_T, launch_min_uint32); + SET(MIN, INT64_T, launch_min_int64); + SET(MIN, UINT64_T, launch_min_uint64); + SET(MIN, FLOAT, launch_min_float); + SET(MIN, DOUBLE, launch_min_double); + + SET(SUM, INT8_T, launch_sum_int8); + SET(SUM, UINT8_T, launch_sum_uint8); + SET(SUM, INT16_T, launch_sum_int16); + SET(SUM, UINT16_T, launch_sum_uint16); + SET(SUM, INT32_T, launch_sum_int32); + SET(SUM, UINT32_T, launch_sum_uint32); + SET(SUM, INT64_T, launch_sum_int64); + SET(SUM, UINT64_T, launch_sum_uint64); + SET(SUM, FLOAT, launch_sum_float); + SET(SUM, DOUBLE, launch_sum_double); + + SET(PROD, INT8_T, launch_prod_int8); + SET(PROD, UINT8_T, launch_prod_uint8); + SET(PROD, INT16_T, launch_prod_int16); + SET(PROD, UINT16_T, launch_prod_uint16); + SET(PROD, INT32_T, launch_prod_int32); + SET(PROD, UINT32_T, launch_prod_uint32); + SET(PROD, INT64_T, launch_prod_int64); + SET(PROD, UINT64_T, launch_prod_uint64); + SET(PROD, FLOAT, launch_prod_float); + SET(PROD, DOUBLE, launch_prod_double); + + SET(BAND, INT8_T, launch_band_int8); + SET(BAND, UINT8_T, launch_band_uint8); + SET(BAND, INT16_T, launch_band_int16); + SET(BAND, UINT16_T, launch_band_uint16); + SET(BAND, INT32_T, launch_band_int32); + SET(BAND, UINT32_T, launch_band_uint32); + SET(BAND, INT64_T, launch_band_int64); + SET(BAND, UINT64_T, launch_band_uint64); + + SET(BOR, INT8_T, launch_bor_int8); + SET(BOR, UINT8_T, launch_bor_uint8); + SET(BOR, INT16_T, launch_bor_int16); + SET(BOR, UINT16_T, launch_bor_uint16); + SET(BOR, INT32_T, launch_bor_int32); + SET(BOR, UINT32_T, launch_bor_uint32); + SET(BOR, INT64_T, launch_bor_int64); + SET(BOR, UINT64_T, launch_bor_uint64); + + SET(BXOR, INT8_T, launch_bxor_int8); + SET(BXOR, UINT8_T, launch_bxor_uint8); + SET(BXOR, INT16_T, launch_bxor_int16); + SET(BXOR, UINT16_T, launch_bxor_uint16); + SET(BXOR, INT32_T, launch_bxor_int32); + SET(BXOR, UINT32_T, launch_bxor_uint32); + SET(BXOR, INT64_T, launch_bxor_int64); + SET(BXOR, UINT64_T, launch_bxor_uint64); + + /* LAND, LOR, LXOR, MAXLOC, MINLOC, REPLACE, NO_OP: NULL → host path */ +#undef SET +} diff --git a/ompi/mca/op/rocm/op_rocm_session.c b/ompi/mca/op/rocm/op_rocm_session.c new file mode 100644 index 00000000000..b445d5fc3f5 --- /dev/null +++ b/ompi/mca/op/rocm/op_rocm_session.c @@ -0,0 +1,239 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2025 Amazon.com, Inc. or its affiliates. All rights + * reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +/* + * Session lifecycle for the ROCm persistent-kernel op component. + * Mirrors op_cuda_session.c with hip* API calls in place of cuda*. + * + * ompi_op_rocm_cmd_queue_t inherits ompi_op_gpu_cmd_queue_t. OBJ_NEW + * allocates the object; the OBJ destructor releases the HIP stream and + * managed memory. The component returns the base pointer from alloc so + * callers need no knowledge of the concrete type. + * + * session_begin: look up the kernel for (op, dtype), reset the cmd_queue + * state, and launch the persistent kernel on the existing + * stream. Wires all session dispatch hooks and returns the + * session. Returns NULL if no kernel exists. + * + * session_reduce: write src/dst/count to the command slot, set status=1 + * to wake the kernel, and spin until status==2. + * + * session_stop: signal the persistent kernel to exit and synchronize the + * stream. The cmd_queue's HIP stream and managed memory + * remain allocated for reuse. + */ + +#include "ompi_config.h" +#include +#include +#include + +#include + +#include "opal/mca/accelerator/base/base.h" +#include "ompi/op/op.h" +#include "ompi/datatype/ompi_datatype.h" +#include "ompi/op/op_gpu_session.h" +#include "ompi/mca/op/op.h" +#include "ompi/mca/op/rocm/op_rocm.h" + +/* ompi_op_ddt_map[] maps dtype->id → OMPI_OP_BASE_TYPE_* (-1 if none) */ +extern int ompi_op_ddt_map[OMPI_DATATYPE_MAX_PREDEFINED]; + +/* Forward declarations of static session hooks referenced from session_begin. */ +static void ompi_op_rocm_session_reduce(ompi_op_gpu_session_t *session, + const void *src1, const void *src2, + void *dst, size_t count); +static void ompi_op_rocm_session_stop(ompi_op_gpu_session_t *session); + +/* -------------------------------------------------------------------------- + * OBJ constructor / destructor for ompi_op_rocm_cmd_queue_t + * -------------------------------------------------------------------------- */ +static void +ompi_op_rocm_cmd_queue_construct(ompi_op_rocm_cmd_queue_t *q) +{ + q->shutdown = NULL; + q->stream = NULL; + q->super.cmd = NULL; + q->super.dev_id = -1; + q->super.allocator = NULL; + q->super.session_begin_fn = NULL; +} + +static void +ompi_op_rocm_cmd_queue_destruct(ompi_op_rocm_cmd_queue_t *q) +{ + if (NULL != q->stream) { + hipStreamDestroy(q->stream); + q->stream = NULL; + } + if (NULL != q->shutdown) { + hipFree((void *) q->shutdown); + q->shutdown = NULL; + } + if (NULL != q->super.cmd) { + hipFree(q->super.cmd); + q->super.cmd = NULL; + } +} + +OBJ_CLASS_INSTANCE(ompi_op_rocm_cmd_queue_t, + ompi_op_gpu_cmd_queue_t, + ompi_op_rocm_cmd_queue_construct, + ompi_op_rocm_cmd_queue_destruct); + +/* -------------------------------------------------------------------------- + * ompi_op_rocm_cmd_queue_alloc + * -------------------------------------------------------------------------- */ +ompi_op_gpu_cmd_queue_t * +ompi_op_rocm_cmd_queue_alloc(int dev_id) +{ + ompi_op_rocm_cmd_queue_t *q = OBJ_NEW(ompi_op_rocm_cmd_queue_t); + if (NULL == q) { + return NULL; + } + + hipError_t err; + + /* Allocate managed-memory command slot (accessible by both CPU and GPU) */ + err = hipMallocManaged((void **) &q->super.cmd, + sizeof(ompi_op_gpu_cmd_t), + hipMemAttachGlobal); + if (hipSuccess != err) { + OBJ_RELEASE(q); + return NULL; + } + q->super.cmd->src1 = NULL; + q->super.cmd->src2 = NULL; + q->super.cmd->dst = NULL; + q->super.cmd->count = 0; + q->super.cmd->status = 0; + + /* Allocate managed-memory shutdown flag */ + err = hipMallocManaged((void **) &q->shutdown, + sizeof(int32_t), + hipMemAttachGlobal); + if (hipSuccess != err) { + OBJ_RELEASE(q); + return NULL; + } + *q->shutdown = 0; + + /* Create a dedicated non-blocking stream for this cmd_queue */ + err = hipStreamCreateWithFlags(&q->stream, hipStreamNonBlocking); + if (hipSuccess != err) { + OBJ_RELEASE(q); + return NULL; + } + + q->super.dev_id = dev_id; + q->super.allocator = opal_accelerator_base_get_device_allocator(dev_id); + return &q->super; +} + +/* -------------------------------------------------------------------------- + * ompi_op_rocm_session_begin + * -------------------------------------------------------------------------- */ +ompi_op_gpu_session_t * +ompi_op_rocm_session_begin(ompi_op_gpu_cmd_queue_t *queue, + struct ompi_op_t *op, + struct ompi_datatype_t *dtype) +{ + int op_idx = op->o_f_to_c_index; + int type_idx = (dtype->id < OMPI_DATATYPE_MAX_PREDEFINED) + ? ompi_op_ddt_map[dtype->id] : -1; + + if (op_idx < 0 || op_idx >= OMPI_OP_BASE_FORTRAN_OP_MAX || + type_idx < 0 || type_idx >= OMPI_OP_BASE_TYPE_MAX) { + return NULL; + } + + ompi_op_rocm_launcher_fn_t launcher = ompi_op_rocm_kernel_fns[op_idx][type_idx]; + if (NULL == launcher) { + return NULL; + } + + ompi_op_rocm_cmd_queue_t *cq = (ompi_op_rocm_cmd_queue_t *) queue; + + /* Reset queue state for the new kernel */ + *cq->shutdown = 0; + queue->cmd->src1 = NULL; + queue->cmd->src2 = NULL; + queue->cmd->dst = NULL; + queue->cmd->count = 0; + queue->cmd->status = 0; + + /* Launch the persistent kernel (1 block, 256 threads) */ + launcher(queue->cmd, cq->shutdown, cq->stream); + hipError_t err = hipGetLastError(); + if (hipSuccess != err) { + return NULL; + } + + ompi_op_gpu_session_t *session = + (ompi_op_gpu_session_t *) malloc(sizeof(ompi_op_gpu_session_t)); + if (NULL == session) { + return NULL; + } + + session->queue = queue; + session->allocator = queue->allocator; + session->reduce_fn = ompi_op_rocm_session_reduce; + session->stop_fn = ompi_op_rocm_session_stop; + return session; +} + +/* -------------------------------------------------------------------------- + * ompi_op_rocm_session_reduce + * -------------------------------------------------------------------------- */ +static void +ompi_op_rocm_session_reduce(ompi_op_gpu_session_t *session, + const void *src1, const void *src2, + void *dst, size_t count) +{ + ompi_op_gpu_cmd_t *cmd = session->queue->cmd; + + /* Write operands before signalling the kernel */ + cmd->src1 = src1; + cmd->src2 = src2; + cmd->dst = dst; + cmd->count = (int64_t) count; + + __atomic_thread_fence(__ATOMIC_SEQ_CST); /* ensure writes visible to GPU */ + cmd->status = 1; /* wake the kernel */ + + /* Spin-wait for the kernel to signal completion */ + while (2 != cmd->status) { + sched_yield(); /* relinquish CPU timeslice while waiting */ + } + + /* Reset for the next call */ + cmd->status = 0; +} + +/* -------------------------------------------------------------------------- + * ompi_op_rocm_session_stop + * + * Signal the persistent kernel to exit and wait for the stream to drain. + * The cmd_queue's stream and managed memory remain allocated for reuse. + * -------------------------------------------------------------------------- */ +static void +ompi_op_rocm_session_stop(ompi_op_gpu_session_t *session) +{ + ompi_op_rocm_cmd_queue_t *cq = (ompi_op_rocm_cmd_queue_t *) session->queue; + + /* Signal the kernel to exit its loop */ + *cq->shutdown = 1; + __atomic_thread_fence(__ATOMIC_SEQ_CST); + + /* Wait for the kernel to finish; stream remains valid after this */ + hipStreamSynchronize(cq->stream); +} diff --git a/ompi/op/Makefile.am b/ompi/op/Makefile.am index 5599c31311b..b86bb1a3965 100644 --- a/ompi/op/Makefile.am +++ b/ompi/op/Makefile.am @@ -23,5 +23,7 @@ # ompi/Makefile.am headers += op/op.h +headers += op/op_gpu_session.h lib@OMPI_LIBMPI_NAME@_la_SOURCES += op/op.c +lib@OMPI_LIBMPI_NAME@_la_SOURCES += op/op_gpu_session.c diff --git a/ompi/op/op_gpu_session.c b/ompi/op/op_gpu_session.c new file mode 100644 index 00000000000..f9217764e1f --- /dev/null +++ b/ompi/op/op_gpu_session.c @@ -0,0 +1,226 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2025 Amazon.com, Inc. or its affiliates. All rights + * reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +/* + * Dispatcher and cmd_queue pool for GPU reduction sessions. + * + * The expensive GPU resources — managed-memory command slot, shutdown flag, + * and private GPU stream — are bundled into an ompi_op_gpu_cmd_queue_t and + * pooled by dev_id. Sessions themselves are lightweight structs (function + * pointers + a pointer to the cmd_queue) and are allocated fresh for each + * collective. + * + * Pool implementation: + * cmd_queue_pool — opal_lifo_t providing lock-free thread-safe push/pop. + * cmd_queue_pool_count — atomic counter tracking current pool depth; + * used to enforce CMD_QUEUE_POOL_MAX without a mutex. + * + * Pool lifecycle: + * session_end() — stops the persistent kernel (cmd_queue resources remain + * allocated), then pushes the cmd_queue into the lifo pool. + * session_begin() — pops from the lifo looking for a matching dev_id entry + * and calls queue->session_begin_fn(queue, op, dtype) to + * configure and relaunch the kernel; no cudaMalloc overhead. + * On pool miss, iterates op components to allocate a fresh + * cmd_queue, then calls session_begin. + * On pool hit with no matching dev_id, the queue is pushed + * back and a fresh allocation is attempted. + * On pool hit with no kernel for (op, dtype), the queue is + * returned to the pool and NULL is returned. + * + * CMD_QUEUE_POOL_MAX caps the total number of idle cmd_queues to bound GPU + * resource accumulation. + */ + +#include "ompi_config.h" + +#include + +#include "opal/class/opal_lifo.h" +#include "opal/class/opal_list.h" +#include "opal/mca/accelerator/base/base.h" +#include "opal/mca/base/base.h" +#include "opal/sys/atomic.h" +#include "ompi/mca/op/op.h" +#include "ompi/mca/op/base/base.h" +#include "ompi/op/op_gpu_session.h" +#include "ompi/op/op.h" + +OBJ_CLASS_INSTANCE(ompi_op_gpu_cmd_queue_t, opal_list_item_t, NULL, NULL); + +/* Maximum number of idle cmd_queues kept in the pool. */ +#define CMD_QUEUE_POOL_MAX 16 + +static opal_lifo_t cmd_queue_pool; +static opal_atomic_int32_t cmd_queue_pool_count = 0; + +/* -------------------------------------------------------------------------- + * cmd_queue_destroy — permanently release a cmd_queue's GPU resources. + * OBJ_RELEASE dispatches through the concrete class destructor chain + * (e.g. ompi_op_cuda_cmd_queue_t) and frees the allocation. + * -------------------------------------------------------------------------- */ +static void +cmd_queue_destroy(ompi_op_gpu_cmd_queue_t *queue) +{ + OBJ_RELEASE(queue); +} + +/* -------------------------------------------------------------------------- + * cmd_queue_pool_push — return a cmd_queue to the pool. + * Destroys the queue instead if the pool is already at capacity. + * -------------------------------------------------------------------------- */ +static void +cmd_queue_pool_push(ompi_op_gpu_cmd_queue_t *queue) +{ + if (opal_atomic_add_fetch_32(&cmd_queue_pool_count, 1) <= CMD_QUEUE_POOL_MAX) { + opal_lifo_push(&cmd_queue_pool, &queue->super); + } else { + opal_atomic_add_fetch_32(&cmd_queue_pool_count, -1); + cmd_queue_destroy(queue); + } +} + +/* -------------------------------------------------------------------------- + * ompi_op_gpu_session_pool_init + * -------------------------------------------------------------------------- */ +void +ompi_op_gpu_session_pool_init(void) +{ + OBJ_CONSTRUCT(&cmd_queue_pool, opal_lifo_t); +} + +/* -------------------------------------------------------------------------- + * ompi_op_gpu_session_begin + * + * 1. Pop one entry from the lifo pool. + * 2. If dev_id matches: call queue->session_begin_fn to configure and + * relaunch the kernel. On success return the session. On failure + * (no kernel for this op/dtype), push the queue back and return NULL. + * 3. If dev_id doesn't match: push the queue back and fall through to + * fresh allocation. + * 4. Pool miss: iterate op components to allocate a fresh cmd_queue and + * call opc_session_begin. + * -------------------------------------------------------------------------- */ +ompi_op_gpu_session_t * +ompi_op_gpu_session_begin(struct ompi_op_t *op, + struct ompi_datatype_t *dtype, + int dev_id) +{ + /* Check pool for a reusable cmd_queue. */ + opal_list_item_t *item = opal_lifo_pop(&cmd_queue_pool); + if (NULL != item) { + opal_atomic_add_fetch_32(&cmd_queue_pool_count, -1); + ompi_op_gpu_cmd_queue_t *q = (ompi_op_gpu_cmd_queue_t *) item; + + if (q->dev_id == dev_id) { + ompi_op_gpu_session_t *s = q->session_begin_fn(q, op, dtype); + if (NULL != s) { + return s; + } + /* No GPU kernel for this (op, dtype). Return the cmd_queue to + * the pool so it can be reused for a future combination that does + * have a kernel. Caller falls back to ompi_op_reduce(). */ + cmd_queue_pool_push(q); + return NULL; + } + + /* Wrong device — push back and fall through to fresh allocation. */ + cmd_queue_pool_push(q); + } + + /* Pool miss (or wrong device) — allocate a fresh cmd_queue. */ + mca_base_component_list_item_t *cli; + OPAL_LIST_FOREACH(cli, &ompi_op_base_framework.framework_components, + mca_base_component_list_item_t) { + const mca_base_component_t *bc = cli->cli_component; + + if (1 != bc->mca_type_major_version || + 0 != bc->mca_type_minor_version || + 0 != bc->mca_type_release_version) { + continue; + } + + const ompi_op_base_component_1_0_0_t *opc = + (const ompi_op_base_component_1_0_0_t *) bc; + + if (NULL == opc->opc_cmd_queue_alloc || + NULL == opc->opc_session_begin) { + continue; + } + + ompi_op_gpu_cmd_queue_t *q = opc->opc_cmd_queue_alloc(dev_id); + if (NULL == q) { + continue; + } + + /* Wire session_begin_fn into the cmd_queue. */ + q->session_begin_fn = opc->opc_session_begin; + + ompi_op_gpu_session_t *session = opc->opc_session_begin(q, op, dtype); + if (NULL == session) { + /* This component has no kernel for (op, dtype); discard the queue. */ + cmd_queue_destroy(q); + continue; + } + + return session; + } + + return NULL; +} + +/* -------------------------------------------------------------------------- + * ompi_op_gpu_session_reduce + * -------------------------------------------------------------------------- */ +void +ompi_op_gpu_session_reduce(ompi_op_gpu_session_t *session, + const void *src1, const void *src2, + void *dst, size_t count) +{ + session->reduce_fn(session, src1, src2, dst, count); +} + +/* -------------------------------------------------------------------------- + * ompi_op_gpu_session_end + * + * Stop the persistent kernel and return the cmd_queue to the pool. + * -------------------------------------------------------------------------- */ +void +ompi_op_gpu_session_end(ompi_op_gpu_session_t *session) +{ + if (NULL == session) { + return; + } + + /* Signal the kernel to exit and wait for the stream to drain. */ + session->stop_fn(session); + + ompi_op_gpu_cmd_queue_t *q = session->queue; + free(session); + + cmd_queue_pool_push(q); +} + +/* -------------------------------------------------------------------------- + * ompi_op_gpu_session_pool_finalize + * + * Drain the pool, release all GPU resources, and destroy the lifo. + * Called once from ompi_op_base_close() during MPI_Finalize. + * -------------------------------------------------------------------------- */ +void +ompi_op_gpu_session_pool_finalize(void) +{ + opal_list_item_t *item; + while (NULL != (item = opal_lifo_pop(&cmd_queue_pool))) { + cmd_queue_destroy((ompi_op_gpu_cmd_queue_t *) item); + } + OBJ_DESTRUCT(&cmd_queue_pool); +} diff --git a/ompi/op/op_gpu_session.h b/ompi/op/op_gpu_session.h new file mode 100644 index 00000000000..36cba08fefb --- /dev/null +++ b/ompi/op/op_gpu_session.h @@ -0,0 +1,135 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2025 Amazon.com, Inc. or its affiliates. All rights + * reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#ifndef OMPI_OP_GPU_SESSION_H +#define OMPI_OP_GPU_SESSION_H + +#include "ompi_config.h" +#include +#include +#include "opal/class/opal_list.h" +#include "opal/mca/allocator/allocator.h" + +BEGIN_C_DECLS + +struct ompi_op_t; +struct ompi_datatype_t; + +/** + * Managed-memory command slot shared between the host and the persistent + * reduction kernel (accessible by both CPU and GPU via managed/unified memory). + * + * status lifecycle (per reduction call): + * 0 = idle (initial; host resets after kernel signals done) + * 1 = work_ready (host → kernel: pointers and count are valid) + * 2 = done (kernel → host: reduction complete) + */ +typedef struct { + const void *src1; + const void *src2; + void *dst; + int64_t count; + volatile int32_t status; +} ompi_op_gpu_cmd_t; + +/** + * The expensive-to-create GPU resources needed by a persistent reduction + * kernel: a managed-memory command slot and a private GPU stream. Pooled + * by dev_id so they can be reused across collectives without paying + * cudaMallocManaged/hipMallocManaged overhead on every call. + * + * GPU components (cuda, rocm) inherit from this base by placing it as the + * first member named "super" in their own cmd_queue struct, then allocate + * with OBJ_NEW and return the base pointer. Destruction (including GPU + * resource cleanup) is dispatched automatically through the OBJ class chain. + * + * session_begin_fn is wired at cmd_queue_alloc time by the component. + */ +typedef struct ompi_op_gpu_cmd_queue_t { + opal_list_item_t super; /* MUST be first: used by opal_lifo_t pool */ + int dev_id; + mca_allocator_base_module_t *allocator; /* GPU scratch allocator for this device */ + ompi_op_gpu_cmd_t *cmd; /* managed memory — shared with GPU */ + /* Session creation hook — wired at cmd_queue_alloc time by the component. */ + struct ompi_op_gpu_session_t *(*session_begin_fn)( + struct ompi_op_gpu_cmd_queue_t *queue, + struct ompi_op_t *op, + struct ompi_datatype_t *dtype); +} ompi_op_gpu_cmd_queue_t; +OBJ_CLASS_DECLARATION(ompi_op_gpu_cmd_queue_t); + +/** + * Per-collective GPU reduction session. Created by ompi_op_gpu_session_begin() + * before a collective algorithm's reduction loop, and destroyed (with its + * cmd_queue recycled to the pool) by ompi_op_gpu_session_end(). + * + * Sessions are lightweight: all expensive GPU resources (managed memory, + * GPU stream) live in the cmd_queue, which is pooled separately. The session + * holds only a pointer to the cmd_queue and the dispatch function pointers. + * + * The component's opc_session_begin wires queue, allocator, reduce_fn, and + * stop_fn. Callers must not set these fields directly. + * + * When no GPU op component supports the (op, dtype) combination, begin() + * returns NULL and all callers fall back to ompi_op_reduce(). + */ +typedef struct ompi_op_gpu_session_t { + ompi_op_gpu_cmd_queue_t *queue; + mca_allocator_base_module_t *allocator; /* GPU scratch allocator (= queue->allocator) */ + /* Dispatch hooks wired by the component's opc_session_begin. */ + void (*reduce_fn)(struct ompi_op_gpu_session_t *session, + const void *src1, const void *src2, void *dst, size_t count); + /* Signal the persistent kernel to exit and synchronize the stream. + * The cmd_queue's resources remain valid for reuse after this call. */ + void (*stop_fn)(struct ompi_op_gpu_session_t *session); +} ompi_op_gpu_session_t; + +/** + * Create a GPU reduction session and launch a persistent reduction kernel. + * Returns NULL if no GPU op component supports this (op, dtype) combination + * or if no GPU op component is loaded — the caller must then use ompi_op_reduce. + */ +OMPI_DECLSPEC ompi_op_gpu_session_t *ompi_op_gpu_session_begin(struct ompi_op_t *op, + struct ompi_datatype_t *dtype, + int dev_id); + +/** + * Post one reduction command (src1 op src2 → dst) to the persistent kernel and + * wait for completion. src2 may alias dst for in-place operations. + * Behavior is undefined if session is NULL. + */ +OMPI_DECLSPEC void ompi_op_gpu_session_reduce(ompi_op_gpu_session_t *session, + const void *src1, const void *src2, + void *dst, size_t count); + +/** + * Stop the persistent kernel and return the session's cmd_queue to the pool + * for reuse. GPU stream and managed memory remain allocated; a future begin() + * call for the same dev_id will relaunch the kernel without allocating new + * resources. NULL-safe. + */ +OMPI_DECLSPEC void ompi_op_gpu_session_end(ompi_op_gpu_session_t *session); + +/** + * Initialize the cmd_queue pool. Must be called once before any session + * operations (from ompi_op_base_open via the framework open hook). + */ +OMPI_DECLSPEC void ompi_op_gpu_session_pool_init(void); + +/** + * Drain and permanently destroy all pooled cmd_queues. Must be called once + * during MPI finalization (from ompi_op_base_close). + */ +OMPI_DECLSPEC void ompi_op_gpu_session_pool_finalize(void); + +END_C_DECLS + +#endif /* OMPI_OP_GPU_SESSION_H */ diff --git a/opal/mca/accelerator/base/accelerator_base_frame.c b/opal/mca/accelerator/base/accelerator_base_frame.c index 55e13f3773b..bbcfc84b37c 100644 --- a/opal/mca/accelerator/base/accelerator_base_frame.c +++ b/opal/mca/accelerator/base/accelerator_base_frame.c @@ -18,6 +18,9 @@ #include "opal/mca/accelerator/base/base.h" #include "opal/mca/base/base.h" #include "opal/mca/mca.h" +#include "opal/mca/allocator/allocator.h" +#include "opal/mca/allocator/basic/allocator_basic.h" +#include "opal/mca/threads/mutex.h" /* * The following file was created by configure. It contains extern @@ -30,6 +33,134 @@ opal_accelerator_base_module_t opal_accelerator = {0}; opal_accelerator_base_component_t opal_accelerator_base_selected_component = {{0}}; +/* Per-device allocator pool — allocated lazily to num_devices on first use. */ +static mca_allocator_base_module_t **opal_accel_device_allocators = NULL; +static int opal_accel_num_devices = 0; +static opal_mutex_t opal_accel_alloc_lock = OPAL_MUTEX_STATIC_INIT; + +/* + * Tracks a single GPU segment returned by opal_accelerator.mem_alloc so it + * can be released on cleanup. The basic allocator never calls seg_free during + * normal operation (only compact/finalize would, and compact is a no-op), so + * we keep our own list instead of relying on it. + */ +struct opal_accel_alloc_seg_t { + opal_list_item_t super; + void *ptr; +}; +typedef struct opal_accel_alloc_seg_t opal_accel_alloc_seg_t; +OBJ_CLASS_INSTANCE(opal_accel_alloc_seg_t, opal_list_item_t, NULL, NULL); + +typedef struct { + int dev_id; + opal_list_t segs; /* every GPU segment allocated via seg_alloc */ +} opal_accel_alloc_ctx_t; + +/* + * seg_alloc is called (under the basic allocator's internal lock) whenever the + * free list has no block large enough. Record each new GPU segment so it can + * be released on cleanup. + */ +static void *opal_accel_seg_alloc(void *ctx, size_t *size) +{ + opal_accel_alloc_ctx_t *ac = (opal_accel_alloc_ctx_t *) ctx; + opal_accel_alloc_seg_t *seg; + void *ptr = NULL; + + if (OPAL_SUCCESS != opal_accelerator.mem_alloc(ac->dev_id, &ptr, *size)) { + return NULL; + } + + seg = OBJ_NEW(opal_accel_alloc_seg_t); + if (OPAL_LIKELY(NULL != seg)) { + seg->ptr = ptr; + opal_list_append(&ac->segs, &seg->super); + } + return ptr; +} + +/* seg_free is wired into the allocator API but never invoked during normal + * operation (basic allocator compact is a no-op). Cleanup is handled + * explicitly in opal_accelerator_base_frame_close via the segs list. */ +static void opal_accel_seg_free(void *ctx, void *seg) +{ + (void) ctx; + (void) seg; +} + +mca_allocator_base_module_t * +opal_accelerator_base_get_device_allocator(int dev_id) +{ + mca_allocator_base_module_t *alloc; + opal_accel_alloc_ctx_t *ctx; + + if (dev_id < 0 || NULL == opal_accelerator.mem_alloc) { + return NULL; + } + + /* Fast path: array already sized and slot already filled. */ + if (NULL != opal_accel_device_allocators + && dev_id < opal_accel_num_devices + && NULL != opal_accel_device_allocators[dev_id]) { + return opal_accel_device_allocators[dev_id]; + } + + OPAL_THREAD_LOCK(&opal_accel_alloc_lock); + + /* Lazily allocate the per-device array on first call. */ + if (NULL == opal_accel_device_allocators) { + int num_devices = 0; + if (OPAL_SUCCESS != opal_accelerator.num_devices(&num_devices) || num_devices <= 0) { + OPAL_THREAD_UNLOCK(&opal_accel_alloc_lock); + return NULL; + } + opal_accel_device_allocators = calloc(num_devices, + sizeof(*opal_accel_device_allocators)); + if (NULL == opal_accel_device_allocators) { + OPAL_THREAD_UNLOCK(&opal_accel_alloc_lock); + return NULL; + } + opal_accel_num_devices = num_devices; + } + + if (dev_id >= opal_accel_num_devices) { + OPAL_THREAD_UNLOCK(&opal_accel_alloc_lock); + return NULL; + } + + if (NULL == opal_accel_device_allocators[dev_id]) { + ctx = (opal_accel_alloc_ctx_t *) malloc(sizeof(*ctx)); + if (NULL == ctx) { + OPAL_THREAD_UNLOCK(&opal_accel_alloc_lock); + return NULL; + } + ctx->dev_id = dev_id; + OBJ_CONSTRUCT(&ctx->segs, opal_list_t); + /* + * Use the basic (first-fit + coalescing) allocator rather than the + * bucket allocator. When a large block is freed it can be split to + * serve a smaller future request, and adjacent free blocks are merged + * back together, giving good reuse across the varying scratch-buffer + * sizes produced by collective algorithms. GPU segments are retained + * in the free list for the lifetime of the process; the GPU driver + * reclaims device memory on context teardown. + */ + alloc = mca_allocator_basic_component_init(true, + opal_accel_seg_alloc, + opal_accel_seg_free, + ctx); + if (NULL == alloc) { + free(ctx); + OPAL_THREAD_UNLOCK(&opal_accel_alloc_lock); + return NULL; + } + opal_accel_device_allocators[dev_id] = alloc; + } + + OPAL_THREAD_UNLOCK(&opal_accel_alloc_lock); + return opal_accel_device_allocators[dev_id]; +} + static int opal_accelerator_base_frame_register(mca_base_register_flag_t flags) { return OPAL_SUCCESS; @@ -37,6 +168,31 @@ static int opal_accelerator_base_frame_register(mca_base_register_flag_t flags) static int opal_accelerator_base_frame_close(void) { + if (NULL != opal_accel_device_allocators) { + for (int i = 0; i < opal_accel_num_devices; i++) { + if (NULL != opal_accel_device_allocators[i]) { + opal_accel_alloc_ctx_t *ctx = + (opal_accel_alloc_ctx_t *) opal_accel_device_allocators[i]->alc_context; + opal_accel_alloc_seg_t *seg; + + /* Release all GPU segments tracked in seg_alloc before the + * basic allocator frees its internal structures. */ + while (NULL != (seg = (opal_accel_alloc_seg_t *) + opal_list_remove_first(&ctx->segs))) { + opal_accelerator.mem_release(ctx->dev_id, seg->ptr); + OBJ_RELEASE(seg); + } + OBJ_DESTRUCT(&ctx->segs); + + opal_accel_device_allocators[i]->alc_finalize(opal_accel_device_allocators[i]); + free(ctx); + opal_accel_device_allocators[i] = NULL; + } + } + free(opal_accel_device_allocators); + opal_accel_device_allocators = NULL; + opal_accel_num_devices = 0; + } return mca_base_framework_components_close(&opal_accelerator_base_framework, NULL); } diff --git a/opal/mca/accelerator/base/base.h b/opal/mca/accelerator/base/base.h index e5922032ea8..94892d15c90 100644 --- a/opal/mca/accelerator/base/base.h +++ b/opal/mca/accelerator/base/base.h @@ -20,6 +20,7 @@ #include "opal/mca/accelerator/accelerator.h" #include "opal/mca/base/mca_base_framework.h" #include "opal/mca/mca.h" +#include "opal/mca/allocator/allocator.h" BEGIN_C_DECLS @@ -33,6 +34,14 @@ OPAL_DECLSPEC int opal_accelerator_base_select(void); OPAL_DECLSPEC extern opal_accelerator_base_component_t opal_accelerator_base_selected_component; +/** + * Return a pooled allocator for device memory on the given device. + * Created lazily and cached for the lifetime of the process. + * Returns NULL if no accelerator is available or dev_id is invalid. + */ +OPAL_DECLSPEC mca_allocator_base_module_t * +opal_accelerator_base_get_device_allocator(int dev_id); + END_C_DECLS #endif