diff --git a/config/opal_mca.m4 b/config/opal_mca.m4
index bb51d3bc5f1..7c597166289 100644
--- a/config/opal_mca.m4
+++ b/config/opal_mca.m4
@@ -186,7 +186,7 @@ of type-component pairs.  For example, --enable-mca-no-build=pml-ob1])
     else
        msg=
        if test -z "$enable_mca_dso"; then
-           enable_mca_dso="accelerator-cuda,accelerator-rocm,accelerator-ze"
+           enable_mca_dso="accelerator-cuda,accelerator-rocm,accelerator-ze,op-cuda,op-rocm"
            msg="(default)"
        fi
        DSO_all=0
diff --git a/ompi/mca/coll/acoll/coll_acoll_allreduce.c b/ompi/mca/coll/acoll/coll_acoll_allreduce.c
index 3b40fef39f9..6da452ec719 100644
--- a/ompi/mca/coll/acoll/coll_acoll_allreduce.c
+++ b/ompi/mca/coll/acoll/coll_acoll_allreduce.c
@@ -483,7 +483,7 @@ int mca_coll_acoll_allreduce_intra(const void *sbuf, void *rbuf, size_t count,
     /* Falling back to recursivedoubling for non-commutative operators to be safe */
     if (!ompi_op_is_commute(op)) {
         return ompi_coll_base_allreduce_intra_recursivedoubling(sbuf, rbuf, count, dtype, op, comm,
-                                                                module);
+                                                                module, NULL);
     }
 
     /* Obtain the subcomms structure */
@@ -497,7 +497,7 @@ int mca_coll_acoll_allreduce_intra(const void *sbuf, void *rbuf, size_t count,
     /* Fallback to knomial if subc is not obtained */
     if (NULL == subc) {
         return ompi_coll_base_allreduce_intra_redscat_allgather(sbuf, rbuf, count, dtype, op, comm,
-                                                                module);
+                                                                module, NULL);
     }
     if (!subc->initialized) {
         err = mca_coll_acoll_comm_split_init(comm, acoll_module, subc, 0);
@@ -513,7 +513,7 @@ int mca_coll_acoll_allreduce_intra(const void *sbuf, void *rbuf, size_t count,
     if (num_nodes > 1) {
         if (total_dsize > 16384) {
             return ompi_coll_base_allreduce_intra_redscat_allgather(sbuf, rbuf, count, dtype, op,
-                                                                    comm, module);
+                                                                    comm, module, NULL);
         }
         int use_socket = acoll_module->use_socket != -1 ? acoll_module->use_socket : 0;
         coll_acoll_subcomms_t *soc_subc = NULL;
@@ -525,7 +525,7 @@ int mca_coll_acoll_allreduce_intra(const void *sbuf, void *rbuf, size_t count,
         /* Validate communicator hierarchy before proceeding */
         if (NULL == soc_comm || NULL == ldr_comm) {
             return ompi_coll_base_allreduce_intra_redscat_allgather(sbuf, rbuf, count, dtype, op,
-                                                                    comm, module);
+                                                                    comm, module, NULL);
         }
 
         err = check_and_create_subc(soc_comm, acoll_module, &soc_subc);
@@ -573,10 +573,10 @@ int mca_coll_acoll_allreduce_intra(const void *sbuf, void *rbuf, size_t count,
             if (ompi_comm_size(ldr_comm) > 1 && -1 != ldr_root) {
                 if ((MPI_IN_PLACE == sbuf)) {
                     err = ompi_coll_base_allreduce_intra_recursivedoubling(MPI_IN_PLACE, rbuf, count, dtype, op,
-                                                                           ldr_comm, module);
+                                                                           ldr_comm, module, NULL);
                 } else {
                     err = ompi_coll_base_allreduce_intra_recursivedoubling(tmp_sbuf, rbuf, count, dtype, op,
-                                                                           ldr_comm, module);
+                                                                           ldr_comm, module, NULL);
                 }
                 if (MPI_SUCCESS != err) {
                     if (NULL != inplacebuf_free) {
@@ -607,23 +607,23 @@ int mca_coll_acoll_allreduce_intra(const void *sbuf, void *rbuf, size_t count,
     if (1 == num_nodes) {
         if (total_dsize < 32) {
             return ompi_coll_base_allreduce_intra_recursivedoubling(sbuf, rbuf, count, dtype, op,
-                                                                    comm, module);
+                                                                    comm, module, NULL);
         } else if ((total_dsize < 512) && is_opt) {
             return mca_coll_acoll_allreduce_small_msgs_h(sbuf, rbuf, count, dtype, op, comm, module,
                                                          subc, 1);
         } else if (total_dsize <= 2048) {
             return ompi_coll_base_allreduce_intra_recursivedoubling(sbuf, rbuf, count, dtype, op,
-                                                                    comm, module);
+                                                                    comm, module, NULL);
         } else if (total_dsize < 65536) {
             if (1 == alg) {
                 return ompi_coll_base_allreduce_intra_recursivedoubling(sbuf, rbuf, count, dtype,
-                                                                        op, comm, module);
+                                                                        op, comm, module, NULL);
             } else if (2 == alg) {
                 return ompi_coll_base_allreduce_intra_redscat_allgather(sbuf, rbuf, count, dtype,
-                                                                        op, comm, module);
+                                                                        op, comm, module, NULL);
             } else { /*3 == alg */
                 return ompi_coll_base_allreduce_intra_ring_segmented(sbuf, rbuf, count, dtype, op,
-                                                                     comm, module, 0);
+                                                                     comm, module, 0, NULL);
             }
         } else if (total_dsize < 4194304) {
             if (((0 != subc->smsc_use_sr_buf) || (subc->smsc_buf_size > 2 * total_dsize))
@@ -631,7 +631,7 @@ int mca_coll_acoll_allreduce_intra(const void *sbuf, void *rbuf, size_t count,
                 return mca_coll_acoll_allreduce_smsc_f(sbuf, rbuf, count, dtype, op, comm, module, subc);
             } else {
                 return ompi_coll_base_allreduce_intra_redscat_allgather(sbuf, rbuf, count, dtype,
-                                                                        op, comm, module);
+                                                                        op, comm, module, NULL);
             }
         } else if (total_dsize <= 16777216) {
             if (((0 != subc->smsc_use_sr_buf) || (subc->smsc_buf_size > 2 * total_dsize))
@@ -640,7 +640,7 @@ int mca_coll_acoll_allreduce_intra(const void *sbuf, void *rbuf, size_t count,
                 return mca_coll_acoll_bcast(rbuf, count, dtype, 0, comm, module);
             } else {
                 return ompi_coll_base_allreduce_intra_redscat_allgather(sbuf, rbuf, count, dtype,
-                                                                        op, comm, module);
+                                                                        op, comm, module, NULL);
             }
         } else {
             if (((0 != subc->smsc_use_sr_buf) || (subc->smsc_buf_size > 2 * total_dsize))
@@ -648,13 +648,13 @@ int mca_coll_acoll_allreduce_intra(const void *sbuf, void *rbuf, size_t count,
                 return mca_coll_acoll_allreduce_smsc_f(sbuf, rbuf, count, dtype, op, comm, module, subc);
             } else {
                 return ompi_coll_base_allreduce_intra_redscat_allgather(sbuf, rbuf, count, dtype,
-                                                                        op, comm, module);
+                                                                        op, comm, module, NULL);
             }
         }
 
     } else {
         return ompi_coll_base_allreduce_intra_redscat_allgather(sbuf, rbuf, count, dtype, op, comm,
-                                                                module);
+                                                                module, NULL);
     }
     return MPI_SUCCESS;
 }
diff --git a/ompi/mca/coll/acoll/coll_acoll_reduce.c b/ompi/mca/coll/acoll/coll_acoll_reduce.c
index 28fc3c62c6a..8fee8f7c2a2 100644
--- a/ompi/mca/coll/acoll/coll_acoll_reduce.c
+++ b/ompi/mca/coll/acoll/coll_acoll_reduce.c
@@ -360,11 +360,11 @@ int mca_coll_acoll_reduce_intra(const void *sbuf, void *rbuf, size_t count,
     /* Falling back to inorder binary for non-commutative operators to be safe */
     if (!ompi_op_is_commute(op)) {
         return ompi_coll_base_reduce_intra_in_order_binary(sbuf, rbuf, count, dtype, op, root, comm,
-                                                           module, 0, 0);
+                                                           module, 0, 0, NULL);
     }
     if (0 != root) { // ToDo: support non-zero root
         return ompi_coll_base_reduce_intra_binomial(sbuf, rbuf, count, dtype, op, root, comm,
-                                                    module, 0, 0);
+                                                    module, 0, 0, NULL);
     }
 
     /* Disable shm/xpmem based optimizations if: */
@@ -396,7 +396,7 @@ int mca_coll_acoll_reduce_intra(const void *sbuf, void *rbuf, size_t count,
     /* Fallback to knomial if subc is not obtained */
     if (NULL == subc) {
         return ompi_coll_base_reduce_intra_binomial(sbuf, rbuf, count, dtype, op, root, comm,
-                                                    module, 0, 0);
+                                                    module, 0, 0, NULL);
     }
 
     if (!subc->initialized || (root != subc->prev_init_root)) {
@@ -422,10 +422,10 @@ int mca_coll_acoll_reduce_intra(const void *sbuf, void *rbuf, size_t count,
                                                                 comm, module);
             } else if (2 == alg) {
                 return ompi_coll_base_reduce_intra_binomial(sbuf, rbuf, count, dtype, op, root,
-                                                            comm, module, 0, 0);
+                                                            comm, module, 0, 0, NULL);
             } else { /* either 3 == alg or acoll_module->red_algo is not 0, 1, 2*/
                 return ompi_coll_base_reduce_intra_in_order_binary(sbuf, rbuf, count, dtype, op,
-                                                                   root, comm, module, 0, 0);
+                                                                   root, comm, module, 0, 0, NULL);
             }
         } else {
             if ((((0 != subc->smsc_use_sr_buf)
@@ -437,7 +437,7 @@ int mca_coll_acoll_reduce_intra(const void *sbuf, void *rbuf, size_t count,
                                                    module, subc);
             } else {
                 return ompi_coll_base_reduce_intra_binomial(sbuf, rbuf, count, dtype, op,
-                                                                   root, comm, module, 0, 0);
+                                                                   root, comm, module, 0, 0, NULL);
             }
         }
     } else {
@@ -446,7 +446,7 @@ int mca_coll_acoll_reduce_intra(const void *sbuf, void *rbuf, size_t count,
                                           subc);
         } else {
             return ompi_coll_base_reduce_intra_binomial(sbuf, rbuf, count, dtype, op, root, comm,
-                                                        module, 0, 0);
+                                                        module, 0, 0, NULL);
         }
     }
     return MPI_SUCCESS;
diff --git a/ompi/mca/coll/acoll/coll_acoll_utils.h b/ompi/mca/coll/acoll/coll_acoll_utils.h
index 41d02381b5f..1fc9af3a931 100644
--- a/ompi/mca/coll/acoll/coll_acoll_utils.h
+++ b/ompi/mca/coll/acoll/coll_acoll_utils.h
@@ -33,6 +33,19 @@ extern int mca_coll_acoll_without_smsc;
 extern int mca_coll_acoll_smsc_use_sr_buf;
 extern int mca_coll_acoll_barrier_algo;
 
+/* Wrapper so recursivedoubling can be stored as a module function pointer
+ * despite having gained an allocator parameter in coll_base_functions.h. */
+static int
+ompi_coll_acoll_allreduce_intra_recursivedoubling(const void *sbuf, void *rbuf, size_t count,
+                                                  struct ompi_datatype_t *dtype,
+                                                  struct ompi_op_t *op,
+                                                  struct ompi_communicator_t *comm,
+                                                  mca_coll_base_module_t *module)
+{
+    return ompi_coll_base_allreduce_intra_recursivedoubling(sbuf, rbuf, count, dtype, op,
+                                                            comm, module, NULL);
+}
+
 /*
  * Hybrid backoff spin-wait with adaptive progress calls.
  * Optimized for intra-node shared memory synchronization.
@@ -440,7 +453,7 @@ static inline int mca_coll_acoll_comm_split_init(ompi_communicator_t *comm,
     int rank = ompi_comm_rank(comm);
 
     (comm)->c_coll->coll_allgather = ompi_coll_base_allgather_intra_ring;
-    (comm)->c_coll->coll_allreduce = ompi_coll_base_allreduce_intra_recursivedoubling;
+    (comm)->c_coll->coll_allreduce = ompi_coll_acoll_allreduce_intra_recursivedoubling;
     (comm)->c_coll->coll_bcast = ompi_coll_base_bcast_intra_basic_linear;
     if (!subc->initialized) {
         OBJ_CONSTRUCT(&comm_info, opal_info_t);
@@ -538,14 +551,14 @@ static inline int mca_coll_acoll_comm_split_init(ompi_communicator_t *comm,
         coll_bcast_loc = (subc->local_comm)->c_coll->coll_bcast;
         (subc->local_comm)->c_coll->coll_allgather = ompi_coll_base_allgather_intra_ring;
         (subc->local_comm)->c_coll->coll_allreduce
-            = ompi_coll_base_allreduce_intra_recursivedoubling;
+            = ompi_coll_acoll_allreduce_intra_recursivedoubling;
         (subc->local_comm)->c_coll->coll_bcast = ompi_coll_base_bcast_intra_basic_linear;
         coll_allreduce_soc = (subc->socket_comm)->c_coll->coll_allreduce;
         coll_allgather_soc = (subc->socket_comm)->c_coll->coll_allgather;
         coll_bcast_soc = (subc->socket_comm)->c_coll->coll_bcast;
         (subc->socket_comm)->c_coll->coll_allgather = ompi_coll_base_allgather_intra_ring;
         (subc->socket_comm)->c_coll->coll_allreduce
-            = ompi_coll_base_allreduce_intra_recursivedoubling;
+            = ompi_coll_acoll_allreduce_intra_recursivedoubling;
         (subc->socket_comm)->c_coll->coll_bcast = ompi_coll_base_bcast_intra_basic_linear;
     }
 
diff --git a/ompi/mca/coll/base/coll_base_allgather.c b/ompi/mca/coll/base/coll_base_allgather.c
index 6d9bd6fcfc3..a9480f09169 100644
--- a/ompi/mca/coll/base/coll_base_allgather.c
+++ b/ompi/mca/coll/base/coll_base_allgather.c
@@ -114,7 +114,7 @@ ompi_coll_base_allgather_intra_recursivedoubling(const void *sbuf, size_t scount
         int k = 2;
         return ompi_coll_base_allgather_intra_k_bruck(sbuf, scount, sdtype,
                                                       rbuf, rcount, rdtype,
-                                                      comm, module, k);
+                                                      comm, module, k, NULL);
     }
 
     OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
@@ -771,7 +771,8 @@ int ompi_coll_base_allgather_intra_k_bruck(const void *sbuf, size_t scount,
                                           struct ompi_datatype_t *rdtype,
                                           struct ompi_communicator_t *comm,
                                           mca_coll_base_module_t *module,
-                                          int radix)
+                                          int radix,
+                                          mca_allocator_base_module_t *allocator)
 {
     int line = -1, rank, size, dst, src, err = MPI_SUCCESS;
     int recvcount, distance;
@@ -796,7 +797,7 @@ int ompi_coll_base_allgather_intra_k_bruck(const void *sbuf, size_t scount,
     if (0 != rank) {
         /* Compute the temporary buffer size, including datatypes empty gaps */
         rsize = opal_datatype_span(&rdtype->super, (size_t)rcount * (size - rank), &rgap);
-        tmp_buf = (char *) malloc(rsize);
+        tmp_buf = (char *) COLL_BASE_ALLOC(allocator, rsize);
         tmp_buf_start = tmp_buf - rgap;
     }
 
@@ -891,7 +892,7 @@ int ompi_coll_base_allgather_intra_k_bruck(const void *sbuf, size_t scount,
         if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
     }
 
-    if(tmp_buf != NULL) free(tmp_buf);
+    if(tmp_buf != NULL) COLL_BASE_FREE(allocator, tmp_buf);
     return MPI_SUCCESS;
 
 err_hndl:
@@ -911,7 +912,7 @@ int ompi_coll_base_allgather_intra_k_bruck(const void *sbuf, size_t scount,
     OPAL_OUTPUT((ompi_coll_base_framework.framework_output,  "%s:%4d\tError occurred %d, rank %2d",
                  __FILE__, line, err, rank));
     if(tmp_buf != NULL) {
-        free(tmp_buf);
+        COLL_BASE_FREE(allocator, tmp_buf);
         tmp_buf = NULL;
         tmp_buf_start = NULL;
     }
diff --git a/ompi/mca/coll/base/coll_base_allreduce.c b/ompi/mca/coll/base/coll_base_allreduce.c
index 82d57328b2e..1d605258102 100644
--- a/ompi/mca/coll/base/coll_base_allreduce.c
+++ b/ompi/mca/coll/base/coll_base_allreduce.c
@@ -40,6 +40,7 @@
 #include "ompi/mca/pml/pml.h"
 #include "ompi/op/op.h"
 #include "ompi/mca/coll/base/coll_base_functions.h"
+#include "ompi/op/op_gpu_session.h"
 #include "coll_base_topo.h"
 #include "coll_base_util.h"
 
@@ -136,7 +137,8 @@ ompi_coll_base_allreduce_intra_recursivedoubling(const void *sbuf, void *rbuf,
                                                   struct ompi_datatype_t *dtype,
                                                   struct ompi_op_t *op,
                                                   struct ompi_communicator_t *comm,
-                                                  mca_coll_base_module_t *module)
+                                                  mca_coll_base_module_t *module,
+                                                  ompi_op_gpu_session_t *session)
 {
     int ret, line, rank, size, adjsize, remote, distance;
     int newrank, newremote, extra_ranks;
@@ -160,7 +162,7 @@ ompi_coll_base_allreduce_intra_recursivedoubling(const void *sbuf, void *rbuf,
 
     /* Allocate and initialize temporary send buffer */
     span = opal_datatype_span(&dtype->super, count, &gap);
-    inplacebuf_free = (char*) malloc(span);
+    inplacebuf_free = (char*) COLL_SESSION_ALLOC(session, span);
     if (NULL == inplacebuf_free) { ret = -1; line = __LINE__; goto error_hndl; }
     inplacebuf = inplacebuf_free - gap;
 
@@ -200,7 +202,7 @@ ompi_coll_base_allreduce_intra_recursivedoubling(const void *sbuf, void *rbuf,
                                     MPI_STATUS_IGNORE));
             if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; }
             /* tmpsend = tmprecv (op) tmpsend */
-            ompi_op_reduce(op, tmprecv, tmpsend, count, dtype);
+            COLL_BASE_REDUCE(session, op, tmprecv, tmpsend, count, dtype);
             newrank = rank >> 1;
         }
     } else {
@@ -230,13 +232,13 @@ ompi_coll_base_allreduce_intra_recursivedoubling(const void *sbuf, void *rbuf,
         /* Apply operation */
         if (rank < remote) {
             /* tmprecv = tmpsend (op) tmprecv */
-            ompi_op_reduce(op, tmpsend, tmprecv, count, dtype);
+            COLL_BASE_REDUCE(session, op, tmpsend, tmprecv, count, dtype);
             tmpswap = tmprecv;
             tmprecv = tmpsend;
             tmpsend = tmpswap;
         } else {
             /* tmpsend = tmprecv (op) tmpsend */
-            ompi_op_reduce(op, tmprecv, tmpsend, count, dtype);
+            COLL_BASE_REDUCE(session, op, tmprecv, tmpsend, count, dtype);
         }
     }
 
@@ -266,14 +268,14 @@ ompi_coll_base_allreduce_intra_recursivedoubling(const void *sbuf, void *rbuf,
         if (ret < 0) { line = __LINE__; goto error_hndl; }
     }
 
-    if (NULL != inplacebuf_free) free(inplacebuf_free);
+    COLL_SESSION_FREE(session, inplacebuf_free);
     return MPI_SUCCESS;
 
  error_hndl:
     OPAL_OUTPUT((ompi_coll_base_framework.framework_output, "%s:%4d\tRank %d Error occurred %d\n",
                  __FILE__, line, rank, ret));
     (void)line;  // silence compiler warning
-    if (NULL != inplacebuf_free) free(inplacebuf_free);
+    COLL_SESSION_FREE(session, inplacebuf_free);
     return ret;
 }
 
@@ -346,7 +348,8 @@ ompi_coll_base_allreduce_intra_ring(const void *sbuf, void *rbuf, size_t count,
                                      struct ompi_datatype_t *dtype,
                                      struct ompi_op_t *op,
                                      struct ompi_communicator_t *comm,
-                                     mca_coll_base_module_t *module)
+                                     mca_coll_base_module_t *module,
+                                     ompi_op_gpu_session_t *session)
 {
     int ret, line, rank, size, k, recv_from, send_to, block_count, inbi;
     int early_segcount, late_segcount, split_rank, max_segcount;
@@ -377,7 +380,8 @@ ompi_coll_base_allreduce_intra_ring(const void *sbuf, void *rbuf, size_t count,
         return (ompi_coll_base_allreduce_intra_recursivedoubling(sbuf, rbuf,
                                                                   count,
                                                                   dtype, op,
-                                                                  comm, module));
+                                                                  comm, module,
+                                                                  session));
     }
 
     /* Allocate and initialize temporary buffers */
@@ -401,10 +405,10 @@ ompi_coll_base_allreduce_intra_ring(const void *sbuf, void *rbuf, size_t count,
     max_real_segsize = true_extent + (max_segcount - 1) * extent;
 
 
-    inbuf[0] = (char*)malloc(max_real_segsize);
+    inbuf[0] = (char*)COLL_SESSION_ALLOC(session, max_real_segsize);
     if (NULL == inbuf[0]) { ret = -1; line = __LINE__; goto error_hndl; }
     if (size > 2) {
-        inbuf[1] = (char*)malloc(max_real_segsize);
+        inbuf[1] = (char*)COLL_SESSION_ALLOC(session, max_real_segsize);
         if (NULL == inbuf[1]) { ret = -1; line = __LINE__; goto error_hndl; }
     }
 
@@ -472,7 +476,7 @@ ompi_coll_base_allreduce_intra_ring(const void *sbuf, void *rbuf, size_t count,
                         ((ptrdiff_t)prevblock * late_segcount + split_rank));
         block_count = ((prevblock < split_rank)? early_segcount : late_segcount);
         tmprecv = ((char*)rbuf) + (ptrdiff_t)block_offset * extent;
-        ompi_op_reduce(op, inbuf[inbi ^ 0x1], tmprecv, block_count, dtype);
+        COLL_BASE_REDUCE(session, op, inbuf[inbi ^ 0x1], tmprecv, block_count, dtype);
 
         /* send previous block to send_to */
         ret = MCA_PML_CALL(send(tmprecv, block_count, dtype, send_to,
@@ -493,7 +497,7 @@ ompi_coll_base_allreduce_intra_ring(const void *sbuf, void *rbuf, size_t count,
                     ((ptrdiff_t)recv_from * late_segcount + split_rank));
     block_count = ((recv_from < split_rank)? early_segcount : late_segcount);
     tmprecv = ((char*)rbuf) + (ptrdiff_t)block_offset * extent;
-    ompi_op_reduce(op, inbuf[inbi], tmprecv, block_count, dtype);
+    COLL_BASE_REDUCE(session, op, inbuf[inbi], tmprecv, block_count, dtype);
 
     /* Distribution loop - variation of ring allgather */
     send_to = (rank + 1) % size;
@@ -524,8 +528,8 @@ ompi_coll_base_allreduce_intra_ring(const void *sbuf, void *rbuf, size_t count,
 
     }
 
-    if (NULL != inbuf[0]) free(inbuf[0]);
-    if (NULL != inbuf[1]) free(inbuf[1]);
+    COLL_SESSION_FREE(session, inbuf[0]);
+    COLL_SESSION_FREE(session, inbuf[1]);
 
     return MPI_SUCCESS;
 
@@ -534,8 +538,8 @@ ompi_coll_base_allreduce_intra_ring(const void *sbuf, void *rbuf, size_t count,
                  __FILE__, line, rank, ret));
     ompi_coll_base_free_reqs(reqs, 2);
     (void)line;  // silence compiler warning
-    if (NULL != inbuf[0]) free(inbuf[0]);
-    if (NULL != inbuf[1]) free(inbuf[1]);
+    COLL_SESSION_FREE(session, inbuf[0]);
+    COLL_SESSION_FREE(session, inbuf[1]);
     return ret;
 }
 
@@ -624,7 +628,8 @@ ompi_coll_base_allreduce_intra_ring_segmented(const void *sbuf, void *rbuf, size
                                                struct ompi_op_t *op,
                                                struct ompi_communicator_t *comm,
                                                mca_coll_base_module_t *module,
-                                               uint32_t segsize)
+                                               uint32_t segsize,
+                                               ompi_op_gpu_session_t *session)
 {
     int ret, line, rank, size, k, recv_from, send_to;
     int early_blockcount, late_blockcount, split_rank;
@@ -660,7 +665,7 @@ ompi_coll_base_allreduce_intra_ring_segmented(const void *sbuf, void *rbuf, size
         if (count < (size_t) (size * segcount)) {
             OPAL_OUTPUT((ompi_coll_base_framework.framework_output, "coll:base:allreduce_ring_segmented rank %d/%d, count %zu, switching to regular ring", rank, size, count));
             return (ompi_coll_base_allreduce_intra_ring(sbuf, rbuf, count, dtype, op,
-                                                         comm, module));
+                                                         comm, module, session));
         }
 
     /* Determine the number of phases of the algorithm */
@@ -689,10 +694,10 @@ ompi_coll_base_allreduce_intra_ring_segmented(const void *sbuf, void *rbuf, size
      max_real_segsize = opal_datatype_span(&dtype->super, max_segcount, &gap);
 
     /* Allocate and initialize temporary buffers */
-    inbuf[0] = (char*)malloc(max_real_segsize);
+    inbuf[0] = (char*)COLL_SESSION_ALLOC(session, max_real_segsize);
     if (NULL == inbuf[0]) { ret = -1; line = __LINE__; goto error_hndl; }
     if (size > 2) {
-        inbuf[1] = (char*)malloc(max_real_segsize);
+        inbuf[1] = (char*)COLL_SESSION_ALLOC(session, max_real_segsize);
         if (NULL == inbuf[1]) { ret = -1; line = __LINE__; goto error_hndl; }
     }
 
@@ -783,7 +788,7 @@ ompi_coll_base_allreduce_intra_ring_segmented(const void *sbuf, void *rbuf, size
                             ((ptrdiff_t)phase * (ptrdiff_t)early_phase_segcount) :
                             ((ptrdiff_t)phase * (ptrdiff_t)late_phase_segcount + split_phase));
             tmprecv = ((char*)rbuf) + (ptrdiff_t)(block_offset + phase_offset) * extent;
-            ompi_op_reduce(op, inbuf[inbi ^ 0x1], tmprecv, phase_count, dtype);
+            COLL_BASE_REDUCE(session, op, inbuf[inbi ^ 0x1], tmprecv, phase_count, dtype);
 
             /* send previous block to send_to */
             ret = MCA_PML_CALL(send(tmprecv, phase_count, dtype, send_to,
@@ -812,7 +817,7 @@ ompi_coll_base_allreduce_intra_ring_segmented(const void *sbuf, void *rbuf, size
                         ((ptrdiff_t)phase * (ptrdiff_t)early_phase_segcount) :
                         ((ptrdiff_t)phase * (ptrdiff_t)late_phase_segcount + split_phase));
         tmprecv = ((char*)rbuf) + (ptrdiff_t)(block_offset + phase_offset) * extent;
-        ompi_op_reduce(op, inbuf[inbi], tmprecv, phase_count, dtype);
+        COLL_BASE_REDUCE(session, op, inbuf[inbi], tmprecv, phase_count, dtype);
     }
 
     /* Distribution loop - variation of ring allgather */
@@ -844,8 +849,8 @@ ompi_coll_base_allreduce_intra_ring_segmented(const void *sbuf, void *rbuf, size
 
     }
 
-    if (NULL != inbuf[0]) free(inbuf[0]);
-    if (NULL != inbuf[1]) free(inbuf[1]);
+    COLL_SESSION_FREE(session, inbuf[0]);
+    COLL_SESSION_FREE(session, inbuf[1]);
 
     return MPI_SUCCESS;
 
@@ -854,8 +859,8 @@ ompi_coll_base_allreduce_intra_ring_segmented(const void *sbuf, void *rbuf, size
                  __FILE__, line, rank, ret));
     ompi_coll_base_free_reqs(reqs, 2);
     (void)line;  // silence compiler warning
-    if (NULL != inbuf[0]) free(inbuf[0]);
-    if (NULL != inbuf[1]) free(inbuf[1]);
+    COLL_SESSION_FREE(session, inbuf[0]);
+    COLL_SESSION_FREE(session, inbuf[1]);
     return ret;
 }
 
@@ -974,7 +979,7 @@ ompi_coll_base_allreduce_intra_basic_linear(const void *sbuf, void *rbuf, size_t
 int ompi_coll_base_allreduce_intra_redscat_allgather(
     const void *sbuf, void *rbuf, size_t count, struct ompi_datatype_t *dtype,
     struct ompi_op_t *op, struct ompi_communicator_t *comm,
-    mca_coll_base_module_t *module)
+    mca_coll_base_module_t *module, ompi_op_gpu_session_t *session)
 {
     int *rindex = NULL, *rcount = NULL, *sindex = NULL, *scount = NULL;
 
@@ -1006,7 +1011,7 @@ int ompi_coll_base_allreduce_intra_redscat_allgather(
 
     /* Temporary buffer for receiving messages */
     char *tmp_buf = NULL;
-    char *tmp_buf_raw = (char *)malloc(dsize);
+    char *tmp_buf_raw = (char *)COLL_SESSION_ALLOC(session, dsize);
     if (NULL == tmp_buf_raw)
         return OMPI_ERR_OUT_OF_RESOURCE;
     tmp_buf = tmp_buf_raw - gap;
@@ -1056,8 +1061,8 @@ int ompi_coll_base_allreduce_intra_redscat_allgather(
             if (MPI_SUCCESS != err) { goto cleanup_and_return; }
 
             /* Reduce on the right half of the buffers (result in rbuf) */
-            ompi_op_reduce(op, (char *)tmp_buf + (ptrdiff_t)count_lhalf * extent,
-                           (char *)rbuf + count_lhalf * extent, count_rhalf, dtype);
+            COLL_BASE_REDUCE(session, op, (char *)tmp_buf + (ptrdiff_t)count_lhalf * extent,
+                             (char *)rbuf + count_lhalf * extent, count_rhalf, dtype);
 
             /* Send the right half to the left neighbor */
             err = MCA_PML_CALL(send((char *)rbuf + (ptrdiff_t)count_lhalf * extent,
@@ -1084,7 +1089,7 @@ int ompi_coll_base_allreduce_intra_redscat_allgather(
             if (MPI_SUCCESS != err) { goto cleanup_and_return; }
 
             /* Reduce on the right half of the buffers (result in rbuf) */
-            ompi_op_reduce(op, tmp_buf, rbuf, count_lhalf, dtype);
+            COLL_BASE_REDUCE(session, op, tmp_buf, rbuf, count_lhalf, dtype);
 
             /* Recv the right half from the right neighbor */
             err = MCA_PML_CALL(recv((char *)rbuf + (ptrdiff_t)count_lhalf * extent,
@@ -1165,9 +1170,9 @@ int ompi_coll_base_allreduce_intra_redscat_allgather(
             if (MPI_SUCCESS != err) { goto cleanup_and_return; }
 
             /* Local reduce: rbuf[] = tmp_buf[] <op> rbuf[] */
-            ompi_op_reduce(op, (char *)tmp_buf + (ptrdiff_t)rindex[step] * extent,
-                           (char *)rbuf + (ptrdiff_t)rindex[step] * extent,
-                           rcount[step], dtype);
+            COLL_BASE_REDUCE(session, op, (char *)tmp_buf + (ptrdiff_t)rindex[step] * extent,
+                             (char *)rbuf + (ptrdiff_t)rindex[step] * extent,
+                             rcount[step], dtype);
 
             /* Move the current window to the received message */
             if (step + 1 < nsteps) {
@@ -1234,8 +1239,7 @@ int ompi_coll_base_allreduce_intra_redscat_allgather(
     }
 
   cleanup_and_return:
-    if (NULL != tmp_buf_raw)
-        free(tmp_buf_raw);
+    COLL_SESSION_FREE(session, tmp_buf_raw);
     if (NULL != rindex)
         free(rindex);
     if (NULL != sindex)
@@ -1268,7 +1272,8 @@ int ompi_coll_base_allreduce_intra_allgather_reduce(const void *sbuf, void *rbuf
                                                     struct ompi_datatype_t *dtype,
                                                     struct ompi_op_t *op,
                                                     struct ompi_communicator_t *comm,
-                                                    mca_coll_base_module_t *module)
+                                                    mca_coll_base_module_t *module,
+                                                    ompi_op_gpu_session_t *session)
 {
     int line = -1;
     char *partial_buf = NULL;
@@ -1289,10 +1294,10 @@ int ompi_coll_base_allreduce_intra_allgather_reduce(const void *sbuf, void *rbuf
     }
     ptrdiff_t buf_size, gap = 0;
     buf_size = opal_datatype_span(&dtype->super, (int64_t)count * size, &gap);
-    partial_buf = (char *) malloc(buf_size);
+    partial_buf = (char *) COLL_SESSION_ALLOC(session, buf_size);
     partial_buf_start = partial_buf - gap;
     buf_size = opal_datatype_span(&dtype->super, (int64_t)count, &gap);
-    tmpsend = (char *) malloc(buf_size);
+    tmpsend = (char *) COLL_SESSION_ALLOC(session, buf_size);
     tmpsend_start = tmpsend - gap;
 
     err = ompi_datatype_copy_content_same_ddt(dtype, count,
@@ -1307,11 +1312,11 @@ int ompi_coll_base_allreduce_intra_allgather_reduce(const void *sbuf, void *rbuf
     if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
 
     for (int target = 1; target < size; target++) {
-        ompi_op_reduce(op,
-                       partial_buf_start + (ptrdiff_t)target * count * extent,
-                       partial_buf_start,
-                       count,
-                       dtype);
+        COLL_BASE_REDUCE(session, op,
+                         partial_buf_start + (ptrdiff_t)target * count * extent,
+                         partial_buf_start,
+                         count,
+                         dtype);
     }
 
     // move data to rbuf
@@ -1320,18 +1325,18 @@ int ompi_coll_base_allreduce_intra_allgather_reduce(const void *sbuf, void *rbuf
                                               (char*)partial_buf_start);
     if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
 
-    if (NULL != partial_buf) free(partial_buf);
-    if (NULL != tmpsend) free(tmpsend);
+    COLL_SESSION_FREE(session, partial_buf);
+    COLL_SESSION_FREE(session, tmpsend);
     return MPI_SUCCESS;
 
 err_hndl:
     if (NULL != partial_buf) {
-        free(partial_buf);
+        COLL_SESSION_FREE(session, partial_buf);
         partial_buf = NULL;
         partial_buf_start = NULL;
     }
      if (NULL != tmpsend) {
-        free(tmpsend);
+        COLL_SESSION_FREE(session, tmpsend);
         tmpsend = NULL;
         tmpsend_start = NULL;
     }
diff --git a/ompi/mca/coll/base/coll_base_exscan.c b/ompi/mca/coll/base/coll_base_exscan.c
index d702eb361b9..d49b7912f7c 100644
--- a/ompi/mca/coll/base/coll_base_exscan.c
+++ b/ompi/mca/coll/base/coll_base_exscan.c
@@ -23,6 +23,7 @@
 #include "ompi/mca/coll/base/coll_base_util.h"
 #include "ompi/mca/pml/pml.h"
 #include "ompi/op/op.h"
+#include "ompi/op/op_gpu_session.h"
 
 /*
  * ompi_coll_base_exscan_intra_linear
@@ -142,7 +143,7 @@ ompi_coll_base_exscan_intra_linear(const void *sbuf, void *rbuf, size_t count,
 int ompi_coll_base_exscan_intra_recursivedoubling(
     const void *sendbuf, void *recvbuf, size_t count, struct ompi_datatype_t *datatype,
     struct ompi_op_t *op, struct ompi_communicator_t *comm,
-    mca_coll_base_module_t *module)
+    mca_coll_base_module_t *module, ompi_op_gpu_session_t *session)
 {
     int err = MPI_SUCCESS;
     char *tmpsend_raw = NULL, *tmprecv_raw = NULL;
@@ -158,8 +159,8 @@ int ompi_coll_base_exscan_intra_recursivedoubling(
 
     ptrdiff_t dsize, gap;
     dsize = opal_datatype_span(&datatype->super, count, &gap);
-    tmpsend_raw = malloc(dsize);
-    tmprecv_raw = malloc(dsize);
+    tmpsend_raw = COLL_SESSION_ALLOC(session, dsize);
+    tmprecv_raw = COLL_SESSION_ALLOC(session, dsize);
     if (NULL == tmpsend_raw || NULL == tmprecv_raw) {
         err = OMPI_ERR_OUT_OF_RESOURCE;
         goto cleanup_and_return;
@@ -195,17 +196,17 @@ int ompi_coll_base_exscan_intra_recursivedoubling(
                     is_first_block = 0;
                 } else {
                     /* Accumulate prefix reduction: recvbuf = precv <op> recvbuf */
-                    ompi_op_reduce(op, precv, recvbuf, count, datatype);
+                    COLL_BASE_REDUCE(session, op, precv, recvbuf, count, datatype);
                 }
                 /* Partial result: psend = precv <op> psend */
-                ompi_op_reduce(op, precv, psend, count, datatype);
+                COLL_BASE_REDUCE(session, op, precv, psend, count, datatype);
             } else {
                 if (is_commute) {
                     /* psend = precv <op> psend */
-                    ompi_op_reduce(op, precv, psend, count, datatype);
+                    COLL_BASE_REDUCE(session, op, precv, psend, count, datatype);
                 } else {
                     /* precv = psend <op> precv */
-                    ompi_op_reduce(op, psend, precv, count, datatype);
+                    COLL_BASE_REDUCE(session, op, psend, precv, count, datatype);
                     char *tmp = psend;
                     psend = precv;
                     precv = tmp;
@@ -215,9 +216,7 @@ int ompi_coll_base_exscan_intra_recursivedoubling(
     }
 
 cleanup_and_return:
-    if (NULL != tmpsend_raw)
-        free(tmpsend_raw);
-    if (NULL != tmprecv_raw)
-        free(tmprecv_raw);
+    COLL_SESSION_FREE(session, tmpsend_raw);
+    COLL_SESSION_FREE(session, tmprecv_raw);
     return err;
 }
diff --git a/ompi/mca/coll/base/coll_base_functions.h b/ompi/mca/coll/base/coll_base_functions.h
index b59ef55f437..b2ab8a16d57 100644
--- a/ompi/mca/coll/base/coll_base_functions.h
+++ b/ompi/mca/coll/base/coll_base_functions.h
@@ -36,6 +36,35 @@
 #include "ompi/mca/coll/coll.h"
 #include "ompi/info/info.h"
 #include "ompi/request/request.h"
+#include "opal/mca/allocator/allocator.h"
+#include "ompi/op/op_gpu_session.h"
+
+/* Allocator-aware helpers for Pattern-A scratch buffers.
+ * Pass allocator=NULL to fall back to plain malloc/free. */
+#define COLL_BASE_ALLOC(allocator, size) \
+    ((allocator) ? (allocator)->alc_alloc((allocator), (size), 0) : malloc(size))
+
+#define COLL_BASE_FREE(allocator, ptr) \
+    do { if (ptr) { if (allocator) (allocator)->alc_free((allocator), (ptr)); \
+                    else free(ptr); } } while (0)
+
+/* GPU session-aware helpers for reduction scratch buffers.
+ * When session is non-NULL, use session->allocator; otherwise fall back to
+ * plain malloc/free.  Pass session=NULL for non-GPU collectives. */
+#define COLL_BASE_REDUCE(session, op, src, dst, count, dtype)                  \
+    do {                                                                        \
+        if (NULL != (session))                                                  \
+            ompi_op_gpu_session_reduce((session), (src), (dst), (dst), (count)); \
+        else                                                                    \
+            ompi_op_reduce((op), (src), (dst), (count), (dtype));              \
+    } while (0)
+
+#define COLL_SESSION_ALLOC(session, size) \
+    ((session) ? COLL_BASE_ALLOC((session)->allocator, (size)) : malloc(size))
+
+#define COLL_SESSION_FREE(session, ptr) \
+    do { if (session) { COLL_BASE_FREE((session)->allocator, (ptr)); } \
+         else { if (ptr) free(ptr); } } while (0)
 
 /* need to include our own topo prototypes so we can malloc data on the comm correctly */
 #include "coll_base_topo.h"
@@ -193,7 +222,7 @@ int ompi_coll_base_allgather_intra_ring(ALLGATHER_ARGS);
 int ompi_coll_base_allgather_intra_neighborexchange(ALLGATHER_ARGS);
 int ompi_coll_base_allgather_intra_basic_linear(ALLGATHER_ARGS);
 int ompi_coll_base_allgather_intra_two_procs(ALLGATHER_ARGS);
-int ompi_coll_base_allgather_intra_k_bruck(ALLGATHER_ARGS, int radix);
+int ompi_coll_base_allgather_intra_k_bruck(ALLGATHER_ARGS, int radix, mca_allocator_base_module_t *allocator);
 int ompi_coll_base_allgather_direct_messaging(ALLGATHER_ARGS);
 
 /* All GatherV */
@@ -206,12 +235,12 @@ int ompi_coll_base_allgatherv_intra_two_procs(ALLGATHERV_ARGS);
 
 /* All Reduce */
 int ompi_coll_base_allreduce_intra_nonoverlapping(ALLREDUCE_ARGS);
-int ompi_coll_base_allreduce_intra_recursivedoubling(ALLREDUCE_ARGS);
-int ompi_coll_base_allreduce_intra_ring(ALLREDUCE_ARGS);
-int ompi_coll_base_allreduce_intra_ring_segmented(ALLREDUCE_ARGS, uint32_t segsize);
+int ompi_coll_base_allreduce_intra_recursivedoubling(ALLREDUCE_ARGS, ompi_op_gpu_session_t *session);
+int ompi_coll_base_allreduce_intra_ring(ALLREDUCE_ARGS, ompi_op_gpu_session_t *session);
+int ompi_coll_base_allreduce_intra_ring_segmented(ALLREDUCE_ARGS, uint32_t segsize, ompi_op_gpu_session_t *session);
 int ompi_coll_base_allreduce_intra_basic_linear(ALLREDUCE_ARGS);
-int ompi_coll_base_allreduce_intra_redscat_allgather(ALLREDUCE_ARGS);
-int ompi_coll_base_allreduce_intra_allgather_reduce(ALLREDUCE_ARGS);
+int ompi_coll_base_allreduce_intra_redscat_allgather(ALLREDUCE_ARGS, ompi_op_gpu_session_t *session);
+int ompi_coll_base_allreduce_intra_allgather_reduce(ALLREDUCE_ARGS, ompi_op_gpu_session_t *session);
 
 /* AlltoAll */
 int ompi_coll_base_alltoall_intra_pairwise(ALLTOALL_ARGS);
@@ -255,48 +284,46 @@ int ompi_coll_base_bcast_intra_scatter_allgather(BCAST_ARGS, uint32_t segsize);
 int ompi_coll_base_bcast_intra_scatter_allgather_ring(BCAST_ARGS, uint32_t segsize);
 
 /* Exscan */
-int ompi_coll_base_exscan_intra_recursivedoubling(EXSCAN_ARGS);
+int ompi_coll_base_exscan_intra_recursivedoubling(EXSCAN_ARGS, ompi_op_gpu_session_t *session);
 int ompi_coll_base_exscan_intra_linear(EXSCAN_ARGS);
-int ompi_coll_base_exscan_intra_recursivedoubling(EXSCAN_ARGS);
 
 /* Gather */
 int ompi_coll_base_gather_intra_basic_linear(GATHER_ARGS);
-int ompi_coll_base_gather_intra_binomial(GATHER_ARGS);
+int ompi_coll_base_gather_intra_binomial(GATHER_ARGS, mca_allocator_base_module_t *allocator);
 int ompi_coll_base_gather_intra_linear_sync(GATHER_ARGS, int first_segment_size);
 
 /* GatherV */
 
 /* Reduce */
-int ompi_coll_base_reduce_generic(REDUCE_ARGS, ompi_coll_tree_t* tree, size_t count_by_segment, int max_outstanding_reqs);
+int ompi_coll_base_reduce_generic(REDUCE_ARGS, ompi_coll_tree_t* tree, size_t count_by_segment, int max_outstanding_reqs, ompi_op_gpu_session_t *session);
 int ompi_coll_base_reduce_intra_basic_linear(REDUCE_ARGS);
-int ompi_coll_base_reduce_intra_chain(REDUCE_ARGS, uint32_t segsize, int fanout, int max_outstanding_reqs );
-int ompi_coll_base_reduce_intra_pipeline(REDUCE_ARGS, uint32_t segsize, int max_outstanding_reqs );
-int ompi_coll_base_reduce_intra_binary(REDUCE_ARGS, uint32_t segsize, int max_outstanding_reqs );
-int ompi_coll_base_reduce_intra_binomial(REDUCE_ARGS, uint32_t segsize, int max_outstanding_reqs );
-int ompi_coll_base_reduce_intra_in_order_binary(REDUCE_ARGS, uint32_t segsize, int max_outstanding_reqs );
-int ompi_coll_base_reduce_intra_redscat_gather(REDUCE_ARGS);
-int ompi_coll_base_reduce_intra_knomial(REDUCE_ARGS, uint32_t segsize, int max_outstanding_reqs, int radix);
+int ompi_coll_base_reduce_intra_chain(REDUCE_ARGS, uint32_t segsize, int fanout, int max_outstanding_reqs, ompi_op_gpu_session_t *session);
+int ompi_coll_base_reduce_intra_pipeline(REDUCE_ARGS, uint32_t segsize, int max_outstanding_reqs, ompi_op_gpu_session_t *session);
+int ompi_coll_base_reduce_intra_binary(REDUCE_ARGS, uint32_t segsize, int max_outstanding_reqs, ompi_op_gpu_session_t *session);
+int ompi_coll_base_reduce_intra_binomial(REDUCE_ARGS, uint32_t segsize, int max_outstanding_reqs, ompi_op_gpu_session_t *session);
+int ompi_coll_base_reduce_intra_in_order_binary(REDUCE_ARGS, uint32_t segsize, int max_outstanding_reqs, ompi_op_gpu_session_t *session);
+int ompi_coll_base_reduce_intra_redscat_gather(REDUCE_ARGS, ompi_op_gpu_session_t *session);
+int ompi_coll_base_reduce_intra_knomial(REDUCE_ARGS, uint32_t segsize, int max_outstanding_reqs, int radix, ompi_op_gpu_session_t *session);
 
 /* Reduce_scatter */
-int ompi_coll_base_reduce_scatter_intra_nonoverlapping(REDUCESCATTER_ARGS);
-int ompi_coll_base_reduce_scatter_intra_basic_recursivehalving(REDUCESCATTER_ARGS);
-int ompi_coll_base_reduce_scatter_intra_ring(REDUCESCATTER_ARGS);
-int ompi_coll_base_reduce_scatter_intra_butterfly(REDUCESCATTER_ARGS);
+int ompi_coll_base_reduce_scatter_intra_nonoverlapping(REDUCESCATTER_ARGS, ompi_op_gpu_session_t *session);
+int ompi_coll_base_reduce_scatter_intra_basic_recursivehalving(REDUCESCATTER_ARGS, ompi_op_gpu_session_t *session);
+int ompi_coll_base_reduce_scatter_intra_ring(REDUCESCATTER_ARGS, ompi_op_gpu_session_t *session);
+int ompi_coll_base_reduce_scatter_intra_butterfly(REDUCESCATTER_ARGS, ompi_op_gpu_session_t *session);
 
 /* Reduce_scatter_block */
-int ompi_coll_base_reduce_scatter_block_basic_linear(REDUCESCATTERBLOCK_ARGS);
-int ompi_coll_base_reduce_scatter_block_intra_recursivedoubling(REDUCESCATTERBLOCK_ARGS);
-int ompi_coll_base_reduce_scatter_block_intra_recursivehalving(REDUCESCATTERBLOCK_ARGS);
-int ompi_coll_base_reduce_scatter_block_intra_butterfly(REDUCESCATTERBLOCK_ARGS);
+int ompi_coll_base_reduce_scatter_block_basic_linear(REDUCESCATTERBLOCK_ARGS, ompi_op_gpu_session_t *session);
+int ompi_coll_base_reduce_scatter_block_intra_recursivedoubling(REDUCESCATTERBLOCK_ARGS, ompi_op_gpu_session_t *session);
+int ompi_coll_base_reduce_scatter_block_intra_recursivehalving(REDUCESCATTERBLOCK_ARGS, ompi_op_gpu_session_t *session);
+int ompi_coll_base_reduce_scatter_block_intra_butterfly(REDUCESCATTERBLOCK_ARGS, ompi_op_gpu_session_t *session);
 
 /* Scan */
-int ompi_coll_base_scan_intra_recursivedoubling(SCAN_ARGS);
+int ompi_coll_base_scan_intra_recursivedoubling(SCAN_ARGS, ompi_op_gpu_session_t *session);
 int ompi_coll_base_scan_intra_linear(SCAN_ARGS);
-int ompi_coll_base_scan_intra_recursivedoubling(SCAN_ARGS);
 
 /* Scatter */
 int ompi_coll_base_scatter_intra_basic_linear(SCATTER_ARGS);
-int ompi_coll_base_scatter_intra_binomial(SCATTER_ARGS);
+int ompi_coll_base_scatter_intra_binomial(SCATTER_ARGS, mca_allocator_base_module_t *allocator);
 int ompi_coll_base_scatter_intra_linear_nb(SCATTER_ARGS, int max_reqs);
 
 /* ScatterV */
diff --git a/ompi/mca/coll/base/coll_base_gather.c b/ompi/mca/coll/base/coll_base_gather.c
index a4486152157..894af0a085a 100644
--- a/ompi/mca/coll/base/coll_base_gather.c
+++ b/ompi/mca/coll/base/coll_base_gather.c
@@ -44,7 +44,8 @@ ompi_coll_base_gather_intra_binomial(const void *sbuf, size_t scount,
                                       struct ompi_datatype_t *rdtype,
                                       int root,
                                       struct ompi_communicator_t *comm,
-                                      mca_coll_base_module_t *module)
+                                      mca_coll_base_module_t *module,
+                                      mca_allocator_base_module_t *allocator)
 {
     int line = -1, i, rank, vrank, size, err;
     size_t total_recv = 0;
@@ -82,7 +83,7 @@ ompi_coll_base_gather_intra_binomial(const void *sbuf, size_t scount,
         } else {
             /* root is not on 0, allocate temp buffer for recv,
              * rotate data at the end */
-            tempbuf = (char *) malloc(rsize);
+            tempbuf = (char *) COLL_BASE_ALLOC(allocator, rsize);
             if (NULL == tempbuf) {
                 err= OMPI_ERR_OUT_OF_RESOURCE; line = __LINE__; goto err_hndl;
             }
@@ -107,7 +108,7 @@ ompi_coll_base_gather_intra_binomial(const void *sbuf, size_t scount,
          * to the property of binimoal tree */
         ompi_datatype_type_extent(sdtype, &sextent);
         ssize = opal_datatype_span(&sdtype->super, (int64_t)scount * size, &sgap);
-        tempbuf = (char *) malloc(ssize);
+        tempbuf = (char *) COLL_BASE_ALLOC(allocator, ssize);
         if (NULL == tempbuf) {
             err= OMPI_ERR_OUT_OF_RESOURCE; line = __LINE__; goto err_hndl;
         }
@@ -180,17 +181,16 @@ ompi_coll_base_gather_intra_binomial(const void *sbuf, size_t scount,
                                                       (char *) rbuf, ptmp + rextent * (ptrdiff_t)rcount * (ptrdiff_t)(size-root));
             if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
 
-            free(tempbuf);
+            COLL_BASE_FREE(allocator, tempbuf);
         }
     } else if (!(vrank % 2)) {
         /* other non-leaf nodes */
-        free(tempbuf);
+        COLL_BASE_FREE(allocator, tempbuf);
     }
     return MPI_SUCCESS;
 
  err_hndl:
-    if (NULL != tempbuf)
-        free(tempbuf);
+    COLL_BASE_FREE(allocator, tempbuf);
 
     OPAL_OUTPUT((ompi_coll_base_framework.framework_output,  "%s:%4d\tError occurred %d, rank %2d",
                  __FILE__, line, err, rank));
diff --git a/ompi/mca/coll/base/coll_base_reduce.c b/ompi/mca/coll/base/coll_base_reduce.c
index f127abef311..3a2a50171b0 100644
--- a/ompi/mca/coll/base/coll_base_reduce.c
+++ b/ompi/mca/coll/base/coll_base_reduce.c
@@ -38,6 +38,7 @@
 #include "ompi/mca/pml/pml.h"
 #include "ompi/op/op.h"
 #include "ompi/mca/coll/base/coll_base_functions.h"
+#include "ompi/op/op_gpu_session.h"
 #include "coll_base_topo.h"
 #include "coll_base_util.h"
 
@@ -66,7 +67,7 @@ int ompi_coll_base_reduce_generic( const void* sendbuf, void* recvbuf, size_t or
                                     int root, ompi_communicator_t* comm,
                                     mca_coll_base_module_t *module,
                                     ompi_coll_tree_t* tree, size_t count_by_segment,
-                                    int max_outstanding_reqs )
+                                    int max_outstanding_reqs, ompi_op_gpu_session_t *session )
 {
     char *inbuf[2] = {NULL, NULL}, *inbuf_free[2] = {NULL, NULL};
     char *accumbuf = NULL, *accumbuf_free = NULL;
@@ -106,7 +107,7 @@ int ompi_coll_base_reduce_generic( const void* sendbuf, void* recvbuf, size_t or
         if( (NULL == accumbuf) || (root != rank) ) {
             /* Allocate temporary accumulator buffer. */
             size = opal_datatype_span(&datatype->super, original_count, &gap);
-            accumbuf_free = (char*)malloc(size);
+            accumbuf_free = (char*)COLL_SESSION_ALLOC(session, size);
             if (accumbuf_free == NULL) {
                 line = __LINE__; ret = -1; goto error_hndl;
             }
@@ -123,7 +124,7 @@ int ompi_coll_base_reduce_generic( const void* sendbuf, void* recvbuf, size_t or
         }
         /* Allocate two buffers for incoming segments */
         real_segment_size = opal_datatype_span(&datatype->super, count_by_segment, &gap);
-        inbuf_free[0] = (char*) malloc(real_segment_size);
+        inbuf_free[0] = (char*) COLL_SESSION_ALLOC(session, real_segment_size);
         if( inbuf_free[0] == NULL ) {
             line = __LINE__; ret = -1; goto error_hndl;
         }
@@ -131,7 +132,7 @@ int ompi_coll_base_reduce_generic( const void* sendbuf, void* recvbuf, size_t or
         /* if there is chance to overlap communication -
            allocate second buffer */
         if( (num_segments > 1) || (tree->tree_nextsize > 1) ) {
-            inbuf_free[1] = (char*) malloc(real_segment_size);
+            inbuf_free[1] = (char*) COLL_SESSION_ALLOC(session, real_segment_size);
             if( inbuf_free[1] == NULL ) {
                 line = __LINE__; ret = -1; goto error_hndl;
             }
@@ -202,9 +203,9 @@ int ompi_coll_base_reduce_generic( const void* sendbuf, void* recvbuf, size_t or
                         }
                     }
                     /* apply operation */
-                    ompi_op_reduce(op, local_op_buffer,
-                                   accumbuf + (ptrdiff_t)segindex * (ptrdiff_t)segment_increment,
-                                   recvcount, datatype );
+                    COLL_BASE_REDUCE(session, op, local_op_buffer,
+                                     accumbuf + (ptrdiff_t)segindex * (ptrdiff_t)segment_increment,
+                                     recvcount, datatype);
                 } else if ( segindex > 0 ) {
                     void* accumulator = accumbuf + (ptrdiff_t)(segindex-1) * (ptrdiff_t)segment_increment;
                     if( tree->tree_nextsize <= 1 ) {
@@ -213,8 +214,8 @@ int ompi_coll_base_reduce_generic( const void* sendbuf, void* recvbuf, size_t or
                             local_op_buffer = sendtmpbuf + (ptrdiff_t)(segindex-1) * (ptrdiff_t)segment_increment;
                         }
                     }
-                    ompi_op_reduce(op, local_op_buffer, accumulator, prevcount,
-                                   datatype );
+                    COLL_BASE_REDUCE(session, op, local_op_buffer, accumulator, prevcount,
+                                     datatype);
 
                     /* all reduced on available data this step (i) complete,
                      * pass to the next process unless you are the root.
@@ -242,9 +243,9 @@ int ompi_coll_base_reduce_generic( const void* sendbuf, void* recvbuf, size_t or
         } /* end of for each segment */
 
         /* clean up */
-        if( inbuf_free[0] != NULL) free(inbuf_free[0]);
-        if( inbuf_free[1] != NULL) free(inbuf_free[1]);
-        if( accumbuf_free != NULL ) free(accumbuf_free);
+        COLL_SESSION_FREE(session, inbuf_free[0]);
+        COLL_SESSION_FREE(session, inbuf_free[1]);
+        COLL_SESSION_FREE(session, accumbuf_free);
     }
 
     /* leaf nodes
@@ -365,9 +366,9 @@ int ompi_coll_base_reduce_generic( const void* sendbuf, void* recvbuf, size_t or
         }
         ompi_coll_base_free_reqs(sreq, max_outstanding_reqs);
     }
-    if( inbuf_free[0] != NULL ) free(inbuf_free[0]);
-    if( inbuf_free[1] != NULL ) free(inbuf_free[1]);
-    if( accumbuf_free != NULL ) free(accumbuf);
+    COLL_SESSION_FREE(session, inbuf_free[0]);
+    COLL_SESSION_FREE(session, inbuf_free[1]);
+    COLL_SESSION_FREE(session, accumbuf_free);
     OPAL_OUTPUT (( ompi_coll_base_framework.framework_output,
                    "ERROR_HNDL: node %d file %s line %d error %d\n",
                    rank, __FILE__, line, ret ));
@@ -388,7 +389,7 @@ int ompi_coll_base_reduce_intra_chain( const void *sendbuf, void *recvbuf, size_
                                         ompi_communicator_t* comm,
                                         mca_coll_base_module_t *module,
                                         uint32_t segsize, int fanout,
-                                        int max_outstanding_reqs )
+                                        int max_outstanding_reqs, ompi_op_gpu_session_t *session )
 {
     size_t segcount = count;
     size_t typelng;
@@ -408,7 +409,7 @@ int ompi_coll_base_reduce_intra_chain( const void *sendbuf, void *recvbuf, size_
     return ompi_coll_base_reduce_generic( sendbuf, recvbuf, count, datatype,
                                            op, root, comm, module,
                                            data->cached_chain,
-                                           segcount, max_outstanding_reqs );
+                                           segcount, max_outstanding_reqs, session );
 }
 
 
@@ -418,7 +419,7 @@ int ompi_coll_base_reduce_intra_pipeline( const void *sendbuf, void *recvbuf,
                                            ompi_communicator_t* comm,
                                            mca_coll_base_module_t *module,
                                            uint32_t segsize,
-                                           int max_outstanding_reqs  )
+                                           int max_outstanding_reqs, ompi_op_gpu_session_t *session  )
 {
     size_t segcount = count;
     size_t typelng;
@@ -440,7 +441,7 @@ int ompi_coll_base_reduce_intra_pipeline( const void *sendbuf, void *recvbuf,
     return ompi_coll_base_reduce_generic( sendbuf, recvbuf, count, datatype,
                                            op, root, comm, module,
                                            data->cached_pipeline,
-                                           segcount, max_outstanding_reqs );
+                                           segcount, max_outstanding_reqs, session );
 }
 
 int ompi_coll_base_reduce_intra_binary( const void *sendbuf, void *recvbuf,
@@ -449,7 +450,7 @@ int ompi_coll_base_reduce_intra_binary( const void *sendbuf, void *recvbuf,
                                          ompi_communicator_t* comm,
                                          mca_coll_base_module_t *module,
                                          uint32_t segsize,
-                                         int max_outstanding_reqs  )
+                                         int max_outstanding_reqs, ompi_op_gpu_session_t *session  )
 {
     size_t segcount = count;
     size_t typelng;
@@ -471,7 +472,7 @@ int ompi_coll_base_reduce_intra_binary( const void *sendbuf, void *recvbuf,
     return ompi_coll_base_reduce_generic( sendbuf, recvbuf, count, datatype,
                                            op, root, comm, module,
                                            data->cached_bintree,
-                                           segcount, max_outstanding_reqs );
+                                           segcount, max_outstanding_reqs, session );
 }
 
 int ompi_coll_base_reduce_intra_binomial( const void *sendbuf, void *recvbuf,
@@ -480,7 +481,7 @@ int ompi_coll_base_reduce_intra_binomial( const void *sendbuf, void *recvbuf,
                                            ompi_communicator_t* comm,
                                            mca_coll_base_module_t *module,
                                            uint32_t segsize,
-                                           int max_outstanding_reqs  )
+                                           int max_outstanding_reqs, ompi_op_gpu_session_t *session  )
 {
     size_t segcount = count;
     size_t typelng;
@@ -502,7 +503,7 @@ int ompi_coll_base_reduce_intra_binomial( const void *sendbuf, void *recvbuf,
     return ompi_coll_base_reduce_generic( sendbuf, recvbuf, count, datatype,
                                            op, root, comm, module,
                                            data->cached_in_order_bmtree,
-                                           segcount, max_outstanding_reqs );
+                                           segcount, max_outstanding_reqs, session );
 }
 
 /*
@@ -519,7 +520,7 @@ int ompi_coll_base_reduce_intra_in_order_binary( const void *sendbuf, void *recv
                                                   ompi_communicator_t* comm,
                                                   mca_coll_base_module_t *module,
                                                   uint32_t segsize,
-                                                  int max_outstanding_reqs  )
+                                                  int max_outstanding_reqs, ompi_op_gpu_session_t *session  )
 {
     int ret, rank, size, io_root, segcount = count;
     void *use_this_sendbuf = NULL;
@@ -560,7 +561,7 @@ int ompi_coll_base_reduce_intra_in_order_binary( const void *sendbuf, void *recv
         dsize = opal_datatype_span(&datatype->super, count, &gap);
 
         if ((root == rank) && (MPI_IN_PLACE == sendbuf)) {
-            tmpbuf_free = (char *) malloc(dsize);
+            tmpbuf_free = (char *) COLL_SESSION_ALLOC(session, dsize);
             if (NULL == tmpbuf_free) {
                 return MPI_ERR_INTERN;
             }
@@ -570,7 +571,7 @@ int ompi_coll_base_reduce_intra_in_order_binary( const void *sendbuf, void *recv
                                                 (char*)recvbuf);
             use_this_sendbuf = tmpbuf;
         } else if (io_root == rank) {
-            tmpbuf_free = (char *) malloc(dsize);
+            tmpbuf_free = (char *) COLL_SESSION_ALLOC(session, dsize);
             if (NULL == tmpbuf_free) {
                 return MPI_ERR_INTERN;
             }
@@ -583,9 +584,9 @@ int ompi_coll_base_reduce_intra_in_order_binary( const void *sendbuf, void *recv
     ret = ompi_coll_base_reduce_generic( use_this_sendbuf, use_this_recvbuf, count, datatype,
                                           op, io_root, comm, module,
                                           data->cached_in_order_bintree,
-                                          segcount, max_outstanding_reqs );
+                                          segcount, max_outstanding_reqs, session );
     if (MPI_SUCCESS != ret) {
-        free(tmpbuf_free);
+        COLL_SESSION_FREE(session, tmpbuf_free);
         return ret;
     }
 
@@ -597,7 +598,7 @@ int ompi_coll_base_reduce_intra_in_order_binary( const void *sendbuf, void *recv
                                     MCA_COLL_BASE_TAG_REDUCE, comm,
                                     MPI_STATUS_IGNORE));
             if (MPI_SUCCESS != ret) {
-                free(tmpbuf_free);
+                COLL_SESSION_FREE(session, tmpbuf_free);
                 return ret;
             }
 
@@ -607,13 +608,13 @@ int ompi_coll_base_reduce_intra_in_order_binary( const void *sendbuf, void *recv
                                     MCA_COLL_BASE_TAG_REDUCE,
                                     MCA_PML_BASE_SEND_STANDARD, comm));
             if (MPI_SUCCESS != ret) {
-                free(tmpbuf_free);
+                COLL_SESSION_FREE(session, tmpbuf_free);
                 return ret;
             }
         }
     }
     if (NULL != tmpbuf_free) {
-        free(tmpbuf_free);
+        COLL_SESSION_FREE(session, tmpbuf_free);
     }
 
     return MPI_SUCCESS;
@@ -812,7 +813,7 @@ ompi_coll_base_reduce_intra_basic_linear(const void *sbuf, void *rbuf, size_t co
 int ompi_coll_base_reduce_intra_redscat_gather(
     const void *sbuf, void *rbuf, size_t count, struct ompi_datatype_t *dtype,
     struct ompi_op_t *op, int root, struct ompi_communicator_t *comm,
-    mca_coll_base_module_t *module)
+    mca_coll_base_module_t *module, ompi_op_gpu_session_t *session)
 {
     int comm_size = ompi_comm_size(comm);
     int rank = ompi_comm_rank(comm);
@@ -844,7 +845,7 @@ int ompi_coll_base_reduce_intra_redscat_gather(
 
     /* Temporary buffers */
     char *tmp_buf_raw = NULL, *rbuf_raw = NULL;
-    tmp_buf_raw = malloc(dsize);
+    tmp_buf_raw = COLL_SESSION_ALLOC(session, dsize);
     if (NULL == tmp_buf_raw) {
         err = OMPI_ERR_OUT_OF_RESOURCE;
         goto cleanup_and_return;
@@ -852,7 +853,7 @@ int ompi_coll_base_reduce_intra_redscat_gather(
     char *tmp_buf = tmp_buf_raw - gap;
 
     if (rank != root) {
-        rbuf_raw = malloc(dsize);
+        rbuf_raw = COLL_SESSION_ALLOC(session, dsize);
         if (NULL == rbuf_raw) {
             err = OMPI_ERR_OUT_OF_RESOURCE;
             goto cleanup_and_return;
@@ -906,8 +907,8 @@ int ompi_coll_base_reduce_intra_redscat_gather(
             if (MPI_SUCCESS != err) { goto cleanup_and_return; }
 
             /* Reduce on the right half of the buffers (result in rbuf) */
-            ompi_op_reduce(op, (char *)tmp_buf + (ptrdiff_t)count_lhalf * extent,
-                           (char *)rbuf + count_lhalf * extent, count_rhalf, dtype);
+            COLL_BASE_REDUCE(session, op, (char *)tmp_buf + (ptrdiff_t)count_lhalf * extent,
+                             (char *)rbuf + count_lhalf * extent, count_rhalf, dtype);
 
             /* Send the right half to the left neighbor */
             err = MCA_PML_CALL(send((char *)rbuf + (ptrdiff_t)count_lhalf * extent,
@@ -934,7 +935,7 @@ int ompi_coll_base_reduce_intra_redscat_gather(
             if (MPI_SUCCESS != err) { goto cleanup_and_return; }
 
             /* Reduce on the right half of the buffers (result in rbuf) */
-            ompi_op_reduce(op, tmp_buf, rbuf, count_lhalf, dtype);
+            COLL_BASE_REDUCE(session, op, tmp_buf, rbuf, count_lhalf, dtype);
 
             /* Recv the right half from the right neighbor */
             err = MCA_PML_CALL(recv((char *)rbuf + (ptrdiff_t)count_lhalf * extent,
@@ -1016,9 +1017,9 @@ int ompi_coll_base_reduce_intra_redscat_gather(
             if (MPI_SUCCESS != err) { goto cleanup_and_return; }
 
             /* Local reduce: rbuf[] = tmp_buf[] <op> rbuf[] */
-            ompi_op_reduce(op, (char *)tmp_buf + (ptrdiff_t)rindex[step] * extent,
-                           (char *)rbuf + (ptrdiff_t)rindex[step] * extent,
-                           rcount[step], dtype);
+            COLL_BASE_REDUCE(session, op, (char *)tmp_buf + (ptrdiff_t)rindex[step] * extent,
+                             (char *)rbuf + (ptrdiff_t)rindex[step] * extent,
+                             rcount[step], dtype);
 
             /* Move the current window to the received message */
             if (step + 1 < nsteps) {
@@ -1129,10 +1130,8 @@ int ompi_coll_base_reduce_intra_redscat_gather(
     }
 
   cleanup_and_return:
-    if (NULL != tmp_buf_raw)
-        free(tmp_buf_raw);
-    if (NULL != rbuf_raw)
-        free(rbuf_raw);
+    COLL_SESSION_FREE(session, tmp_buf_raw);
+    COLL_SESSION_FREE(session, rbuf_raw);
     if (NULL != rindex)
         free(rindex);
     if (NULL != sindex)
@@ -1170,7 +1169,7 @@ int ompi_coll_base_reduce_intra_knomial( const void *sendbuf, void *recvbuf,
                                            ompi_communicator_t* comm,
                                            mca_coll_base_module_t *module,
                                            uint32_t segsize,
-                                           int max_outstanding_reqs, int radix)
+                                           int max_outstanding_reqs, int radix, ompi_op_gpu_session_t *session)
 {
     int err = OMPI_SUCCESS, rank, line;
     ptrdiff_t extent, lb;
@@ -1215,7 +1214,7 @@ int ompi_coll_base_reduce_intra_knomial( const void *sendbuf, void *recvbuf,
         sendtmpbuf = (char *)recvbuf;
     }
     buf_size = opal_datatype_span(&datatype->super, (int64_t)count, &gap);
-    reduce_buf = (char *)malloc(buf_size);
+    reduce_buf = (char *)COLL_SESSION_ALLOC(session, buf_size);
     reduce_buf_start = reduce_buf - gap;
     err = ompi_datatype_copy_content_same_ddt(datatype, count,
                                               (char*)reduce_buf_start,
@@ -1227,7 +1226,7 @@ int ompi_coll_base_reduce_intra_knomial( const void *sendbuf, void *recvbuf,
     max_reqs = num_children;
     if(!is_leaf) {
         buf_size = opal_datatype_span(&datatype->super, (int64_t)count * num_children, &gap);
-        child_buf = (char *)malloc(buf_size);
+        child_buf = (char *)COLL_SESSION_ALLOC(session, buf_size);
         child_buf_start = child_buf - gap;
         reqs = ompi_coll_base_comm_get_reqs(data, max_reqs);
     }
@@ -1250,11 +1249,10 @@ int ompi_coll_base_reduce_intra_knomial( const void *sendbuf, void *recvbuf,
     }
 
     for (int i = 0; i < num_children; i++) {
-        ompi_op_reduce(op,
-                       child_buf_start + (ptrdiff_t)i * count * extent,
-                       reduce_buf,
-                       count,
-                       datatype);
+        COLL_BASE_REDUCE(session, op,
+                         child_buf_start + (ptrdiff_t)i * count * extent,
+                         reduce_buf,
+                         count, datatype);
     }
 
     if (rank != root) {
@@ -1275,18 +1273,18 @@ int ompi_coll_base_reduce_intra_knomial( const void *sendbuf, void *recvbuf,
         if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
     }
 
-    if (NULL != child_buf) free(child_buf);
-    if (NULL != reduce_buf) free(reduce_buf);
+    COLL_SESSION_FREE(session, child_buf);
+    COLL_SESSION_FREE(session, reduce_buf);
     return MPI_SUCCESS;
 
  err_hndl:
     if (NULL != child_buf) {
-        free(child_buf);
+        COLL_SESSION_FREE(session, child_buf);
         child_buf = NULL;
         child_buf_start = NULL;
     }
     if (NULL != reduce_buf) {
-        free(reduce_buf);
+        COLL_SESSION_FREE(session, reduce_buf);
         reduce_buf = NULL;
         reduce_buf_start = NULL;
     }
diff --git a/ompi/mca/coll/base/coll_base_reduce_scatter.c b/ompi/mca/coll/base/coll_base_reduce_scatter.c
index 7a838936378..62930cccedd 100644
--- a/ompi/mca/coll/base/coll_base_reduce_scatter.c
+++ b/ompi/mca/coll/base/coll_base_reduce_scatter.c
@@ -34,6 +34,7 @@
 #include "ompi/mca/coll/base/coll_tags.h"
 #include "ompi/mca/pml/pml.h"
 #include "ompi/op/op.h"
+#include "ompi/op/op_gpu_session.h"
 #include "ompi/mca/coll/base/coll_base_functions.h"
 #include "coll_base_topo.h"
 #include "coll_base_util.h"
@@ -49,7 +50,8 @@ int ompi_coll_base_reduce_scatter_intra_nonoverlapping(const void *sbuf, void *r
                                                         struct ompi_datatype_t *dtype,
                                                         struct ompi_op_t *op,
                                                         struct ompi_communicator_t *comm,
-                                                        mca_coll_base_module_t *module)
+                                                        mca_coll_base_module_t *module,
+                                                        ompi_op_gpu_session_t *session)
 {
     int err, i, rank, size, total_count;
     ptrdiff_t *displs = NULL;
@@ -82,14 +84,14 @@ int ompi_coll_base_reduce_scatter_intra_nonoverlapping(const void *sbuf, void *r
             ptrdiff_t dsize, gap = 0;
             dsize = opal_datatype_span(&dtype->super, total_count, &gap);
 
-            tmprbuf_free = (char*) malloc(dsize);
+            tmprbuf_free = (char*) COLL_SESSION_ALLOC(session, dsize);
             tmprbuf = tmprbuf_free - gap;
         }
         err = comm->c_coll->coll_reduce (sbuf, tmprbuf, total_count,
                                         dtype, op, root, comm, comm->c_coll->coll_reduce_module);
     }
     if (MPI_SUCCESS != err) {
-        if (NULL != tmprbuf_free) free(tmprbuf_free);
+        COLL_SESSION_FREE(session, tmprbuf_free);
         return err;
     }
 
@@ -109,7 +111,7 @@ int ompi_coll_base_reduce_scatter_intra_nonoverlapping(const void *sbuf, void *r
                                            root, comm, comm->c_coll->coll_scatterv_module);
     }
     free(displs);
-    if (NULL != tmprbuf_free) free(tmprbuf_free);
+    COLL_SESSION_FREE(session, tmprbuf_free);
 
     return err;
 }
@@ -138,7 +140,8 @@ ompi_coll_base_reduce_scatter_intra_basic_recursivehalving( const void *sbuf,
                                                             struct ompi_datatype_t *dtype,
                                                             struct ompi_op_t *op,
                                                             struct ompi_communicator_t *comm,
-                                                            mca_coll_base_module_t *module)
+                                                            mca_coll_base_module_t *module,
+                                                            ompi_op_gpu_session_t *session)
 {
     int i, rank, size, err = OMPI_SUCCESS;
     int tmp_size, remain = 0, tmp_rank;
@@ -180,7 +183,7 @@ ompi_coll_base_reduce_scatter_intra_basic_recursivehalving( const void *sbuf,
     }
 
     /* Allocate temporary receive buffer. */
-    recv_buf_free = (char*) malloc(buf_size);
+    recv_buf_free = (char*) COLL_SESSION_ALLOC(session, buf_size);
     recv_buf = recv_buf_free - gap;
     if (NULL == recv_buf_free) {
         err = OMPI_ERR_OUT_OF_RESOURCE;
@@ -188,7 +191,7 @@ ompi_coll_base_reduce_scatter_intra_basic_recursivehalving( const void *sbuf,
     }
 
     /* allocate temporary buffer for results */
-    result_buf_free = (char*) malloc(buf_size);
+    result_buf_free = (char*) COLL_SESSION_ALLOC(session, buf_size);
     result_buf = result_buf_free - gap;
 
     /* copy local buffer into the temporary results */
@@ -221,7 +224,7 @@ ompi_coll_base_reduce_scatter_intra_basic_recursivehalving( const void *sbuf,
                                     comm, MPI_STATUS_IGNORE));
 
             /* integrate their results into our temp results */
-            ompi_op_reduce(op, recv_buf, result_buf, count, dtype);
+            COLL_BASE_REDUCE(session, op, recv_buf, result_buf, count, dtype);
 
             /* adjust rank to be the bottom "remain" ranks */
             tmp_rank = rank / 2;
@@ -339,10 +342,10 @@ ompi_coll_base_reduce_scatter_intra_basic_recursivehalving( const void *sbuf,
                     goto cleanup;
                 }
 
-                ompi_op_reduce(op,
-                               recv_buf + tmp_disps[recv_index] * extent,
-                               result_buf + tmp_disps[recv_index] * extent,
-                               recv_count, dtype);
+                COLL_BASE_REDUCE(session, op,
+                                 recv_buf + tmp_disps[recv_index] * extent,
+                                 result_buf + tmp_disps[recv_index] * extent,
+                                 recv_count, dtype);
             }
 
             /* update for next iteration */
@@ -391,8 +394,8 @@ ompi_coll_base_reduce_scatter_intra_basic_recursivehalving( const void *sbuf,
 
  cleanup:
     if (NULL != disps) free(disps);
-    if (NULL != recv_buf_free) free(recv_buf_free);
-    if (NULL != result_buf_free) free(result_buf_free);
+    COLL_SESSION_FREE(session, recv_buf_free);
+    COLL_SESSION_FREE(session, result_buf_free);
 
     return err;
 }
@@ -464,7 +467,8 @@ ompi_coll_base_reduce_scatter_intra_ring( const void *sbuf, void *rbuf, ompi_cou
                                           struct ompi_datatype_t *dtype,
                                           struct ompi_op_t *op,
                                           struct ompi_communicator_t *comm,
-                                          mca_coll_base_module_t *module)
+                                          mca_coll_base_module_t *module,
+                                          ompi_op_gpu_session_t *session)
 {
     int ret, line, rank, size, i, k, recv_from, send_to;
     int inbi;
@@ -518,15 +522,15 @@ ompi_coll_base_reduce_scatter_intra_ring( const void *sbuf, void *rbuf, ompi_cou
     max_real_segsize = opal_datatype_span(&dtype->super, max_block_count, &gap);
     dsize = opal_datatype_span(&dtype->super, total_count, &gap);
 
-    accumbuf_free = (char*)malloc(dsize);
+    accumbuf_free = (char*)COLL_SESSION_ALLOC(session, dsize);
     if (NULL == accumbuf_free) { ret = -1; line = __LINE__; goto error_hndl; }
     accumbuf = accumbuf_free - gap;
 
-    inbuf_free[0] = (char*)malloc(max_real_segsize);
+    inbuf_free[0] = (char*)COLL_SESSION_ALLOC(session, max_real_segsize);
     if (NULL == inbuf_free[0]) { ret = -1; line = __LINE__; goto error_hndl; }
     inbuf[0] = inbuf_free[0] - gap;
     if (size > 2) {
-        inbuf_free[1] = (char*)malloc(max_real_segsize);
+        inbuf_free[1] = (char*)COLL_SESSION_ALLOC(session, max_real_segsize);
         if (NULL == inbuf_free[1]) { ret = -1; line = __LINE__; goto error_hndl; }
         inbuf[1] = inbuf_free[1] - gap;
     }
@@ -591,7 +595,7 @@ ompi_coll_base_reduce_scatter_intra_ring( const void *sbuf, void *rbuf, ompi_cou
            rbuf[prevblock] = inbuf[inbi ^ 0x1] (op) rbuf[prevblock]
         */
         tmprecv = accumbuf + displs[prevblock] * extent;
-        ompi_op_reduce(op, inbuf[inbi ^ 0x1], tmprecv, ompi_count_array_get(rcounts, prevblock), dtype);
+        COLL_BASE_REDUCE(session, op, inbuf[inbi ^ 0x1], tmprecv, ompi_count_array_get(rcounts, prevblock), dtype);
 
         /* send previous block to send_to */
         ret = MCA_PML_CALL(send(tmprecv, ompi_count_array_get(rcounts, prevblock), dtype, send_to,
@@ -607,7 +611,7 @@ ompi_coll_base_reduce_scatter_intra_ring( const void *sbuf, void *rbuf, ompi_cou
     /* Apply operation on the last block (my block)
        rbuf[rank] = inbuf[inbi] (op) rbuf[rank] */
     tmprecv = accumbuf + displs[rank] * extent;
-    ompi_op_reduce(op, inbuf[inbi], tmprecv, ompi_count_array_get(rcounts, rank), dtype);
+    COLL_BASE_REDUCE(session, op, inbuf[inbi], tmprecv, ompi_count_array_get(rcounts, rank), dtype);
 
     /* Copy result from tmprecv to rbuf */
     ret = ompi_datatype_copy_content_same_ddt(dtype, ompi_count_array_get(rcounts, rank),
@@ -615,9 +619,9 @@ ompi_coll_base_reduce_scatter_intra_ring( const void *sbuf, void *rbuf, ompi_cou
     if (ret < 0) { line = __LINE__; goto error_hndl; }
 
     if (NULL != displs) free(displs);
-    if (NULL != accumbuf_free) free(accumbuf_free);
-    if (NULL != inbuf_free[0]) free(inbuf_free[0]);
-    if (NULL != inbuf_free[1]) free(inbuf_free[1]);
+    COLL_SESSION_FREE(session, accumbuf_free);
+    COLL_SESSION_FREE(session, inbuf_free[0]);
+    COLL_SESSION_FREE(session, inbuf_free[1]);
 
     return MPI_SUCCESS;
 
@@ -626,9 +630,9 @@ ompi_coll_base_reduce_scatter_intra_ring( const void *sbuf, void *rbuf, ompi_cou
                  __FILE__, line, rank, ret));
     (void)line;  // silence compiler warning
     if (NULL != displs) free(displs);
-    if (NULL != accumbuf_free) free(accumbuf_free);
-    if (NULL != inbuf_free[0]) free(inbuf_free[0]);
-    if (NULL != inbuf_free[1]) free(inbuf_free[1]);
+    COLL_SESSION_FREE(session, accumbuf_free);
+    COLL_SESSION_FREE(session, inbuf_free[0]);
+    COLL_SESSION_FREE(session, inbuf_free[1]);
     return ret;
 }
 
@@ -701,7 +705,7 @@ int
 ompi_coll_base_reduce_scatter_intra_butterfly(
     const void *sbuf, void *rbuf, ompi_count_array_t rcounts, struct ompi_datatype_t *dtype,
     struct ompi_op_t *op, struct ompi_communicator_t *comm,
-    mca_coll_base_module_t *module)
+    mca_coll_base_module_t *module, ompi_op_gpu_session_t *session)
 {
     char *tmpbuf[2] = {NULL, NULL}, *psend, *precv;
     ptrdiff_t *displs = NULL, index;
@@ -729,8 +733,8 @@ ompi_coll_base_reduce_scatter_intra_butterfly(
 
     ompi_datatype_type_extent(dtype, &extent);
     span = opal_datatype_span(&dtype->super, totalcount, &gap);
-    tmpbuf[0] = malloc(span);
-    tmpbuf[1] = malloc(span);
+    tmpbuf[0] = COLL_SESSION_ALLOC(session, span);
+    tmpbuf[1] = COLL_SESSION_ALLOC(session, span);
     if (NULL == tmpbuf[0] || NULL == tmpbuf[1]) {
         err = OMPI_ERR_OUT_OF_RESOURCE;
         goto cleanup_and_return;
@@ -781,7 +785,7 @@ ompi_coll_base_reduce_scatter_intra_butterfly(
                                     MCA_COLL_BASE_TAG_REDUCE_SCATTER,
                                     comm, MPI_STATUS_IGNORE));
             if (OMPI_SUCCESS != err) { goto cleanup_and_return; }
-            ompi_op_reduce(op, precv, psend, totalcount, dtype);
+            COLL_BASE_REDUCE(session, op, precv, psend, totalcount, dtype);
             /* Adjust rank to be the bottom "remain" ranks */
             vrank = rank / 2;
         }
@@ -837,15 +841,15 @@ ompi_coll_base_reduce_scatter_intra_butterfly(
 
             if (vrank < vpeer) {
                 /* precv = psend <op> precv */
-                ompi_op_reduce(op, psend + rdispl * extent,
-                               precv + rdispl * extent, recv_count, dtype);
+                COLL_BASE_REDUCE(session, op, psend + rdispl * extent,
+                                 precv + rdispl * extent, recv_count, dtype);
                 char *p = psend;
                 psend = precv;
                 precv = p;
             } else {
                 /* psend = precv <op> psend */
-                ompi_op_reduce(op, precv + rdispl * extent,
-                               psend + rdispl * extent, recv_count, dtype);
+                COLL_BASE_REDUCE(session, op, precv + rdispl * extent,
+                                 psend + rdispl * extent, recv_count, dtype);
             }
             send_index = recv_index;
         }
@@ -899,9 +903,7 @@ ompi_coll_base_reduce_scatter_intra_butterfly(
 cleanup_and_return:
     if (displs)
         free(displs);
-    if (tmpbuf[0])
-        free(tmpbuf[0]);
-    if (tmpbuf[1])
-        free(tmpbuf[1]);
+    COLL_SESSION_FREE(session, tmpbuf[0]);
+    COLL_SESSION_FREE(session, tmpbuf[1]);
     return err;
 }
diff --git a/ompi/mca/coll/base/coll_base_reduce_scatter_block.c b/ompi/mca/coll/base/coll_base_reduce_scatter_block.c
index ca4a6989bec..19319b67ff8 100644
--- a/ompi/mca/coll/base/coll_base_reduce_scatter_block.c
+++ b/ompi/mca/coll/base/coll_base_reduce_scatter_block.c
@@ -38,6 +38,7 @@
 #include "ompi/mca/coll/basic/coll_basic.h"
 #include "ompi/mca/pml/pml.h"
 #include "ompi/op/op.h"
+#include "ompi/op/op_gpu_session.h"
 #include "coll_tags.h"
 #include "coll_base_functions.h"
 #include "coll_base_topo.h"
@@ -59,7 +60,8 @@ ompi_coll_base_reduce_scatter_block_basic_linear(const void *sbuf, void *rbuf, s
                                                  struct ompi_datatype_t *dtype,
                                                  struct ompi_op_t *op,
                                                  struct ompi_communicator_t *comm,
-                                                 mca_coll_base_module_t *module)
+                                                 mca_coll_base_module_t *module,
+                                                 ompi_op_gpu_session_t *session)
 {
     int rank, size, err = OMPI_SUCCESS;
     size_t count;
@@ -101,7 +103,7 @@ ompi_coll_base_reduce_scatter_block_basic_linear(const void *sbuf, void *rbuf, s
         if (0 == rank) {
             /* temporary receive buffer.  See coll_basic_reduce.c for
                details on sizing */
-            recv_buf_free = (char*) malloc(span);
+            recv_buf_free = (char*) COLL_SESSION_ALLOC(session, span);
             if (NULL == recv_buf_free) {
                 err = OMPI_ERR_OUT_OF_RESOURCE;
                 goto cleanup;
@@ -151,7 +153,7 @@ ompi_coll_base_reduce_scatter_block_basic_linear(const void *sbuf, void *rbuf, s
         if (0 == rank) {
             /* temporary receive buffer.  See coll_basic_reduce.c for
                details on sizing */
-            recv_buf_free = (char*) malloc(span);
+            recv_buf_free = (char*) COLL_SESSION_ALLOC(session, span);
             if (NULL == recv_buf_free) {
                 err = OMPI_ERR_OUT_OF_RESOURCE;
                 goto cleanup;
@@ -174,7 +176,7 @@ ompi_coll_base_reduce_scatter_block_basic_linear(const void *sbuf, void *rbuf, s
     }
 
  cleanup:
-    if (NULL != recv_buf_free) free(recv_buf_free);
+    COLL_SESSION_FREE(session, recv_buf_free);
 
     return err;
 }
@@ -198,7 +200,7 @@ int
 ompi_coll_base_reduce_scatter_block_intra_recursivedoubling(
     const void *sbuf, void *rbuf, size_t rcount, struct ompi_datatype_t *dtype,
     struct ompi_op_t *op, struct ompi_communicator_t *comm,
-    mca_coll_base_module_t *module)
+    mca_coll_base_module_t *module, ompi_op_gpu_session_t *session)
 {
     struct ompi_datatype_t *dtypesend = NULL, *dtyperecv = NULL;
     char *tmprecv_raw = NULL, *tmpbuf_raw = NULL, *tmprecv, *tmpbuf;
@@ -225,12 +227,12 @@ ompi_coll_base_reduce_scatter_block_intra_recursivedoubling(
          * will overflow an int data type.
          * Fallback to the linear algorithm.
          */
-        return ompi_coll_base_reduce_scatter_block_basic_linear(sbuf, rbuf, rcount, dtype, op, comm, module);
+        return ompi_coll_base_reduce_scatter_block_basic_linear(sbuf, rbuf, rcount, dtype, op, comm, module, session);
     }
     ompi_datatype_type_extent(dtype, &extent);
     span = opal_datatype_span(&dtype->super, totalcount, &gap);
-    tmpbuf_raw = malloc(span);
-    tmprecv_raw = malloc(span);
+    tmpbuf_raw = COLL_SESSION_ALLOC(session, span);
+    tmprecv_raw = COLL_SESSION_ALLOC(session, span);
     if (NULL == tmpbuf_raw || NULL == tmprecv_raw) {
         err = OMPI_ERR_OUT_OF_RESOURCE;
         goto cleanup_and_return;
@@ -340,15 +342,15 @@ ompi_coll_base_reduce_scatter_block_intra_recursivedoubling(
         if (is_block_received) {
             /* After reduction the result must be in tmpbuf */
             if (is_commutative || (remote_tree_root < cur_tree_root)) {
-                ompi_op_reduce(op, tmprecv, tmpbuf, blocklens[0], dtype);
-                ompi_op_reduce(op, tmprecv + (ptrdiff_t)displs[1] * extent,
-                               tmpbuf + (ptrdiff_t)displs[1] * extent,
-                               blocklens[1], dtype);
+                COLL_BASE_REDUCE(session, op, tmprecv, tmpbuf, blocklens[0], dtype);
+                COLL_BASE_REDUCE(session, op, tmprecv + (ptrdiff_t)displs[1] * extent,
+                                 tmpbuf + (ptrdiff_t)displs[1] * extent,
+                                 blocklens[1], dtype);
             } else {
-                ompi_op_reduce(op, tmpbuf, tmprecv, blocklens[0], dtype);
-                ompi_op_reduce(op, tmpbuf + (ptrdiff_t)displs[1] * extent,
-                               tmprecv + (ptrdiff_t)displs[1] * extent,
-                               blocklens[1], dtype);
+                COLL_BASE_REDUCE(session, op, tmpbuf, tmprecv, blocklens[0], dtype);
+                COLL_BASE_REDUCE(session, op, tmpbuf + (ptrdiff_t)displs[1] * extent,
+                                 tmprecv + (ptrdiff_t)displs[1] * extent,
+                                 blocklens[1], dtype);
                 err = ompi_datatype_copy_content_same_ddt(dtyperecv, 1,
                                                           tmpbuf, tmprecv);
                 if (MPI_SUCCESS != err) { goto cleanup_and_return; }
@@ -368,10 +370,8 @@ ompi_coll_base_reduce_scatter_block_intra_recursivedoubling(
         ompi_datatype_destroy(&dtypesend);
     if (dtyperecv)
         ompi_datatype_destroy(&dtyperecv);
-    if (tmpbuf_raw)
-        free(tmpbuf_raw);
-    if (tmprecv_raw)
-        free(tmprecv_raw);
+    COLL_SESSION_FREE(session, tmpbuf_raw);
+    COLL_SESSION_FREE(session, tmprecv_raw);
     return err;
 }
 
@@ -406,7 +406,7 @@ int
 ompi_coll_base_reduce_scatter_block_intra_recursivehalving(
     const void *sbuf, void *rbuf, size_t rcount, struct ompi_datatype_t *dtype,
     struct ompi_op_t *op, struct ompi_communicator_t *comm,
-    mca_coll_base_module_t *module)
+    mca_coll_base_module_t *module, ompi_op_gpu_session_t *session)
 {
     char *tmprecv_raw = NULL, *tmpbuf_raw = NULL, *tmprecv, *tmpbuf;
     ptrdiff_t span, gap, totalcount, extent;
@@ -425,14 +425,14 @@ ompi_coll_base_reduce_scatter_block_intra_recursivehalving(
                      "coll:base:reduce_scatter_block_intra_recursivehalving: rank %d/%d "
                      "switching to basic reduce_scatter_block", rank, comm_size));
         return ompi_coll_base_reduce_scatter_block_basic_linear(sbuf, rbuf, rcount, dtype,
-                                                                op, comm, module);
+                                                                op, comm, module, session);
     }
 
     totalcount = comm_size * (size_t)rcount;
     ompi_datatype_type_extent(dtype, &extent);
     span = opal_datatype_span(&dtype->super, totalcount, &gap);
-    tmpbuf_raw = malloc(span);
-    tmprecv_raw = malloc(span);
+    tmpbuf_raw = COLL_SESSION_ALLOC(session, span);
+    tmprecv_raw = COLL_SESSION_ALLOC(session, span);
     if (NULL == tmpbuf_raw || NULL == tmprecv_raw) {
         err = OMPI_ERR_OUT_OF_RESOURCE;
         goto cleanup_and_return;
@@ -482,7 +482,7 @@ ompi_coll_base_reduce_scatter_block_intra_recursivehalving(
                                     MCA_COLL_BASE_TAG_REDUCE_SCATTER_BLOCK,
                                     comm, MPI_STATUS_IGNORE));
             if (OMPI_SUCCESS != err) { goto cleanup_and_return; }
-            ompi_op_reduce(op, tmprecv, tmpbuf, totalcount, dtype);
+            COLL_BASE_REDUCE(session, op, tmprecv, tmpbuf, totalcount, dtype);
             /* Adjust rank to be the bottom "remain" ranks */
             vrank = rank / 2;
         }
@@ -546,8 +546,8 @@ ompi_coll_base_reduce_scatter_block_intra_recursivehalving(
             if (recv_count > 0) {
                 err = ompi_request_wait(&request, MPI_STATUS_IGNORE);
                 if (OMPI_SUCCESS != err) { goto cleanup_and_return; }
-                ompi_op_reduce(op, tmprecv + rdispl * extent,
-                               tmpbuf + rdispl * extent, recv_count, dtype);
+                COLL_BASE_REDUCE(session, op, tmprecv + rdispl * extent,
+                                 tmpbuf + rdispl * extent, recv_count, dtype);
             }
             send_index = recv_index;
             last_index = recv_index + mask;
@@ -576,17 +576,15 @@ ompi_coll_base_reduce_scatter_block_intra_recursivehalving(
     }
 
 cleanup_and_return:
-    if (tmpbuf_raw)
-        free(tmpbuf_raw);
-    if (tmprecv_raw)
-        free(tmprecv_raw);
+    COLL_SESSION_FREE(session, tmpbuf_raw);
+    COLL_SESSION_FREE(session, tmprecv_raw);
     return err;
 }
 
 static int ompi_coll_base_reduce_scatter_block_intra_butterfly_pof2(
     const void *sbuf, void *rbuf, size_t rcount, struct ompi_datatype_t *dtype,
     struct ompi_op_t *op, struct ompi_communicator_t *comm,
-    mca_coll_base_module_t *module);
+    mca_coll_base_module_t *module, ompi_op_gpu_session_t *session);
 
 /*
  * ompi_coll_base_reduce_scatter_block_intra_butterfly
@@ -648,7 +646,7 @@ int
 ompi_coll_base_reduce_scatter_block_intra_butterfly(
     const void *sbuf, void *rbuf, size_t rcount, struct ompi_datatype_t *dtype,
     struct ompi_op_t *op, struct ompi_communicator_t *comm,
-    mca_coll_base_module_t *module)
+    mca_coll_base_module_t *module, ompi_op_gpu_session_t *session)
 {
     char *tmpbuf[2] = {NULL, NULL}, *psend, *precv;
     ptrdiff_t span, gap, totalcount, extent;
@@ -665,14 +663,14 @@ ompi_coll_base_reduce_scatter_block_intra_butterfly(
     if (!(comm_size & (comm_size - 1))) {
         /* Special case: comm_size is a power of two */
         return ompi_coll_base_reduce_scatter_block_intra_butterfly_pof2(
-                   sbuf, rbuf, rcount, dtype, op, comm, module);
+                   sbuf, rbuf, rcount, dtype, op, comm, module, session);
     }
 
     totalcount = comm_size * (size_t)rcount;
     ompi_datatype_type_extent(dtype, &extent);
     span = opal_datatype_span(&dtype->super, totalcount, &gap);
-    tmpbuf[0] = malloc(span);
-    tmpbuf[1] = malloc(span);
+    tmpbuf[0] = COLL_SESSION_ALLOC(session, span);
+    tmpbuf[1] = COLL_SESSION_ALLOC(session, span);
     if (NULL == tmpbuf[0] || NULL == tmpbuf[1]) {
         err = OMPI_ERR_OUT_OF_RESOURCE;
         goto cleanup_and_return;
@@ -723,7 +721,7 @@ ompi_coll_base_reduce_scatter_block_intra_butterfly(
                                     MCA_COLL_BASE_TAG_REDUCE_SCATTER_BLOCK,
                                     comm, MPI_STATUS_IGNORE));
             if (OMPI_SUCCESS != err) { goto cleanup_and_return; }
-            ompi_op_reduce(op, precv, psend, totalcount, dtype);
+            COLL_BASE_REDUCE(session, op, precv, psend, totalcount, dtype);
             /* Adjust rank to be the bottom "remain" ranks */
             vrank = rank / 2;
         }
@@ -780,15 +778,15 @@ ompi_coll_base_reduce_scatter_block_intra_butterfly(
 
             if (vrank < vpeer) {
                 /* precv = psend <op> precv */
-                ompi_op_reduce(op, psend + (ptrdiff_t)rdispl * extent,
-                               precv + (ptrdiff_t)rdispl * extent, recv_count, dtype);
+                COLL_BASE_REDUCE(session, op, psend + (ptrdiff_t)rdispl * extent,
+                                 precv + (ptrdiff_t)rdispl * extent, recv_count, dtype);
                 char *p = psend;
                 psend = precv;
                 precv = p;
             } else {
                 /* psend = precv <op> psend */
-                ompi_op_reduce(op, precv + (ptrdiff_t)rdispl * extent,
-                               psend + (ptrdiff_t)rdispl * extent, recv_count, dtype);
+                COLL_BASE_REDUCE(session, op, precv + (ptrdiff_t)rdispl * extent,
+                                 psend + (ptrdiff_t)rdispl * extent, recv_count, dtype);
             }
             send_index = recv_index;
         }
@@ -843,10 +841,8 @@ ompi_coll_base_reduce_scatter_block_intra_butterfly(
     }
 
 cleanup_and_return:
-    if (tmpbuf[0])
-        free(tmpbuf[0]);
-    if (tmpbuf[1])
-        free(tmpbuf[1]);
+    COLL_SESSION_FREE(session, tmpbuf[0]);
+    COLL_SESSION_FREE(session, tmpbuf[1]);
     return err;
 }
 
@@ -895,7 +891,7 @@ static int
 ompi_coll_base_reduce_scatter_block_intra_butterfly_pof2(
     const void *sbuf, void *rbuf, size_t rcount, struct ompi_datatype_t *dtype,
     struct ompi_op_t *op, struct ompi_communicator_t *comm,
-    mca_coll_base_module_t *module)
+    mca_coll_base_module_t *module, ompi_op_gpu_session_t *session)
 {
     char *tmpbuf[2] = {NULL, NULL}, *psend, *precv;
     ptrdiff_t span, gap, totalcount, extent;
@@ -909,8 +905,8 @@ ompi_coll_base_reduce_scatter_block_intra_butterfly_pof2(
     totalcount = comm_size * (size_t)rcount;
     ompi_datatype_type_extent(dtype, &extent);
     span = opal_datatype_span(&dtype->super, totalcount, &gap);
-    tmpbuf[0] = malloc(span);
-    tmpbuf[1] = malloc(span);
+    tmpbuf[0] = COLL_SESSION_ALLOC(session, span);
+    tmpbuf[1] = COLL_SESSION_ALLOC(session, span);
     if (NULL == tmpbuf[0] || NULL == tmpbuf[1]) {
         err = OMPI_ERR_OUT_OF_RESOURCE;
         goto cleanup_and_return;
@@ -951,15 +947,15 @@ ompi_coll_base_reduce_scatter_block_intra_butterfly_pof2(
 
         if (rank < peer) {
             /* precv = psend <op> precv */
-            ompi_op_reduce(op, psend + (ptrdiff_t)recv_index * extent,
-                           precv + (ptrdiff_t)recv_index * extent, nblocks, dtype);
+            COLL_BASE_REDUCE(session, op, psend + (ptrdiff_t)recv_index * extent,
+                             precv + (ptrdiff_t)recv_index * extent, nblocks, dtype);
             char *p = psend;
             psend = precv;
             precv = p;
         } else {
             /* psend = precv <op> psend */
-            ompi_op_reduce(op, precv + (ptrdiff_t)recv_index * extent,
-                           psend + (ptrdiff_t)recv_index * extent, nblocks, dtype);
+            COLL_BASE_REDUCE(session, op, precv + (ptrdiff_t)recv_index * extent,
+                             psend + (ptrdiff_t)recv_index * extent, nblocks, dtype);
         }
         send_index = recv_index;
     }
@@ -969,9 +965,7 @@ ompi_coll_base_reduce_scatter_block_intra_butterfly_pof2(
     if (MPI_SUCCESS != err) { goto cleanup_and_return; }
 
 cleanup_and_return:
-    if (tmpbuf[0])
-        free(tmpbuf[0]);
-    if (tmpbuf[1])
-        free(tmpbuf[1]);
+    COLL_SESSION_FREE(session, tmpbuf[0]);
+    COLL_SESSION_FREE(session, tmpbuf[1]);
     return err;
 }
diff --git a/ompi/mca/coll/base/coll_base_scan.c b/ompi/mca/coll/base/coll_base_scan.c
index 9ac99ed255e..3cf663432ee 100644
--- a/ompi/mca/coll/base/coll_base_scan.c
+++ b/ompi/mca/coll/base/coll_base_scan.c
@@ -23,6 +23,7 @@
 #include "ompi/mca/coll/base/coll_base_util.h"
 #include "ompi/mca/pml/pml.h"
 #include "ompi/op/op.h"
+#include "ompi/op/op_gpu_session.h"
 
 /*
  * ompi_coll_base_scan_intra_linear
@@ -157,7 +158,7 @@ ompi_coll_base_scan_intra_linear(const void *sbuf, void *rbuf, size_t count,
 int ompi_coll_base_scan_intra_recursivedoubling(
     const void *sendbuf, void *recvbuf, size_t count, struct ompi_datatype_t *datatype,
     struct ompi_op_t *op, struct ompi_communicator_t *comm,
-    mca_coll_base_module_t *module)
+    mca_coll_base_module_t *module, ompi_op_gpu_session_t *session)
 {
     int err = MPI_SUCCESS;
     char *tmpsend_raw = NULL, *tmprecv_raw = NULL;
@@ -179,8 +180,8 @@ int ompi_coll_base_scan_intra_recursivedoubling(
 
     ptrdiff_t dsize, gap;
     dsize = opal_datatype_span(&datatype->super, count, &gap);
-    tmpsend_raw = malloc(dsize);
-    tmprecv_raw = malloc(dsize);
+    tmpsend_raw = COLL_SESSION_ALLOC(session, dsize);
+    tmprecv_raw = COLL_SESSION_ALLOC(session, dsize);
     if (NULL == tmpsend_raw || NULL == tmprecv_raw) {
         err = OMPI_ERR_OUT_OF_RESOURCE;
         goto cleanup_and_return;
@@ -203,16 +204,16 @@ int ompi_coll_base_scan_intra_recursivedoubling(
 
             if (rank > remote) {
                 /* Accumulate prefix reduction: recvbuf = precv <op> recvbuf */
-                ompi_op_reduce(op, precv, recvbuf, count, datatype);
+                COLL_BASE_REDUCE(session, op, precv, recvbuf, count, datatype);
                 /* Partial result: psend = precv <op> psend */
-                ompi_op_reduce(op, precv, psend, count, datatype);
+                COLL_BASE_REDUCE(session, op, precv, psend, count, datatype);
             } else {
                 if (is_commute) {
                     /* psend = precv <op> psend */
-                    ompi_op_reduce(op, precv, psend, count, datatype);
+                    COLL_BASE_REDUCE(session, op, precv, psend, count, datatype);
                 } else {
                     /* precv = psend <op> precv */
-                    ompi_op_reduce(op, psend, precv, count, datatype);
+                    COLL_BASE_REDUCE(session, op, psend, precv, count, datatype);
                     char *tmp = psend;
                     psend = precv;
                     precv = tmp;
@@ -222,9 +223,7 @@ int ompi_coll_base_scan_intra_recursivedoubling(
     }
 
 cleanup_and_return:
-    if (NULL != tmpsend_raw)
-        free(tmpsend_raw);
-    if (NULL != tmprecv_raw)
-        free(tmprecv_raw);
+    COLL_SESSION_FREE(session, tmpsend_raw);
+    COLL_SESSION_FREE(session, tmprecv_raw);
     return err;
 }
diff --git a/ompi/mca/coll/base/coll_base_scatter.c b/ompi/mca/coll/base/coll_base_scatter.c
index 795f79a5c72..87b799acc0e 100644
--- a/ompi/mca/coll/base/coll_base_scatter.c
+++ b/ompi/mca/coll/base/coll_base_scatter.c
@@ -64,7 +64,8 @@ ompi_coll_base_scatter_intra_binomial(
     const void *sbuf, size_t scount, struct ompi_datatype_t *sdtype,
     void *rbuf, size_t rcount, struct ompi_datatype_t *rdtype,
     int root, struct ompi_communicator_t *comm,
-    mca_coll_base_module_t *module)
+    mca_coll_base_module_t *module,
+    mca_allocator_base_module_t *allocator)
 {
     mca_coll_base_module_t *base_module = (mca_coll_base_module_t*)module;
     mca_coll_base_comm_t *data = base_module->base_data;
@@ -110,7 +111,7 @@ ompi_coll_base_scatter_intra_binomial(
             opal_convertor_get_packed_size( &convertor, &packed_sizet );
             packed_size = packed_sizet;
             packed_sizet = packed_sizet / size;
-            ptmp = tempbuf = (char *)malloc(packed_size);
+            ptmp = tempbuf = (char *) COLL_BASE_ALLOC(allocator, packed_size);
             if (NULL == tempbuf) {
                 err = OMPI_ERR_OUT_OF_RESOURCE; line = __LINE__; goto err_hndl;
             }
@@ -147,7 +148,7 @@ ompi_coll_base_scatter_intra_binomial(
             subtree_size = size - vrank;
         packed_size = scount * subtree_size;
 
-        ptmp = tempbuf = (char *)malloc(packed_size);
+        ptmp = tempbuf = (char *) COLL_BASE_ALLOC(allocator, packed_size);
         if (NULL == tempbuf) {
             err = OMPI_ERR_OUT_OF_RESOURCE; line = __LINE__; goto err_hndl;
         }
@@ -185,13 +186,13 @@ ompi_coll_base_scatter_intra_binomial(
         curr_count -= send_count;
     }
     if (NULL != tempbuf)
-        free(tempbuf);
+        COLL_BASE_FREE(allocator, tempbuf);
 
     return MPI_SUCCESS;
 
  err_hndl:
     if (NULL != tempbuf)
-        free(tempbuf);
+        COLL_BASE_FREE(allocator, tempbuf);
 
     OPAL_OUTPUT((ompi_coll_base_framework.framework_output,  "%s:%4d\tError occurred %d, rank %2d",
                  __FILE__, line, err, rank));
diff --git a/ompi/mca/coll/basic/coll_basic_reduce_scatter_block.c b/ompi/mca/coll/basic/coll_basic_reduce_scatter_block.c
index 5eb7adfda50..136255fe145 100644
--- a/ompi/mca/coll/basic/coll_basic_reduce_scatter_block.c
+++ b/ompi/mca/coll/basic/coll_basic_reduce_scatter_block.c
@@ -59,7 +59,7 @@ mca_coll_basic_reduce_scatter_block_intra(const void *sbuf, void *rbuf, size_t r
                                           struct ompi_communicator_t *comm,
                                           mca_coll_base_module_t *module)
 {
-    return ompi_coll_base_reduce_scatter_block_basic_linear(sbuf, rbuf, rcount, dtype, op, comm, module);
+    return ompi_coll_base_reduce_scatter_block_basic_linear(sbuf, rbuf, rcount, dtype, op, comm, module, NULL);
 }
 
 /*
diff --git a/ompi/mca/coll/tuned/coll_tuned.h b/ompi/mca/coll/tuned/coll_tuned.h
index 53bb8705aa0..d0b9f82f288 100644
--- a/ompi/mca/coll/tuned/coll_tuned.h
+++ b/ompi/mca/coll/tuned/coll_tuned.h
@@ -26,6 +26,7 @@
 #include "ompi/request/request.h"
 #include "ompi/mca/coll/base/coll_base_functions.h"
 #include "opal/util/output.h"
+#include "ompi/op/op_gpu_session.h"
 
 /* also need the dynamic rule structures */
 #include "coll_tuned_dynamic_rules.h"
@@ -102,7 +103,7 @@ ompi_coll_tuned_comm_query(struct ompi_communicator_t *comm, int *priority);
 /* All Gather */
 int ompi_coll_tuned_allgather_intra_dec_fixed(ALLGATHER_ARGS);
 int ompi_coll_tuned_allgather_intra_dec_dynamic(ALLGATHER_ARGS);
-int ompi_coll_tuned_allgather_intra_do_this(ALLGATHER_ARGS, int algorithm, int faninout, int segsize);
+int ompi_coll_tuned_allgather_intra_do_this(ALLGATHER_ARGS, int algorithm, int faninout, int segsize, mca_allocator_base_module_t *allocator);
 int ompi_coll_tuned_allgather_intra_check_forced_init(coll_tuned_force_algorithm_mca_param_indices_t *mca_param_indices);
 
 /* All GatherV */
@@ -115,7 +116,7 @@ int ompi_coll_tuned_allgatherv_intra_check_forced_init(coll_tuned_force_algorith
 int ompi_coll_tuned_allreduce_intra_dec_fixed(ALLREDUCE_ARGS);
 int ompi_coll_tuned_allreduce_intra_disjoint_dec_fixed(ALLREDUCE_ARGS);
 int ompi_coll_tuned_allreduce_intra_dec_dynamic(ALLREDUCE_ARGS);
-int ompi_coll_tuned_allreduce_intra_do_this(ALLREDUCE_ARGS, int algorithm, int faninout, int segsize);
+int ompi_coll_tuned_allreduce_intra_do_this(ALLREDUCE_ARGS, int algorithm, int faninout, int segsize, ompi_op_gpu_session_t *session);
 int ompi_coll_tuned_allreduce_intra_check_forced_init (coll_tuned_force_algorithm_mca_param_indices_t *mca_param_indices);
 
 /* AlltoAll */
@@ -146,43 +147,43 @@ int ompi_coll_tuned_bcast_intra_check_forced_init (coll_tuned_force_algorithm_mc
 /* Gather */
 int ompi_coll_tuned_gather_intra_dec_fixed(GATHER_ARGS);
 int ompi_coll_tuned_gather_intra_dec_dynamic(GATHER_ARGS);
-int ompi_coll_tuned_gather_intra_do_this(GATHER_ARGS, int algorithm, int faninout, int segsize);
+int ompi_coll_tuned_gather_intra_do_this(GATHER_ARGS, int algorithm, int faninout, int segsize, mca_allocator_base_module_t *allocator);
 int ompi_coll_tuned_gather_intra_check_forced_init (coll_tuned_force_algorithm_mca_param_indices_t *mca_param_indices);
 
 /* Reduce */
 int ompi_coll_tuned_reduce_intra_dec_fixed(REDUCE_ARGS);
 int ompi_coll_tuned_reduce_intra_dec_dynamic(REDUCE_ARGS);
-int ompi_coll_tuned_reduce_intra_do_this(REDUCE_ARGS, int algorithm, int faninout, int segsize, int max_oustanding_reqs);
+int ompi_coll_tuned_reduce_intra_do_this(REDUCE_ARGS, int algorithm, int faninout, int segsize, int max_oustanding_reqs, ompi_op_gpu_session_t *session);
 int ompi_coll_tuned_reduce_intra_check_forced_init (coll_tuned_force_algorithm_mca_param_indices_t *mca_param_indices);
 
 /* Reduce_scatter */
 int ompi_coll_tuned_reduce_scatter_intra_dec_fixed(REDUCESCATTER_ARGS);
 int ompi_coll_tuned_reduce_scatter_intra_dec_dynamic(REDUCESCATTER_ARGS);
-int ompi_coll_tuned_reduce_scatter_intra_do_this(REDUCESCATTER_ARGS, int algorithm, int faninout, int segsize);
+int ompi_coll_tuned_reduce_scatter_intra_do_this(REDUCESCATTER_ARGS, int algorithm, int faninout, int segsize, ompi_op_gpu_session_t *session);
 int ompi_coll_tuned_reduce_scatter_intra_check_forced_init (coll_tuned_force_algorithm_mca_param_indices_t *mca_param_indices);
 
 /* Reduce_scatter_block */
 int ompi_coll_tuned_reduce_scatter_block_intra_dec_fixed(REDUCESCATTERBLOCK_ARGS);
 int ompi_coll_tuned_reduce_scatter_block_intra_dec_dynamic(REDUCESCATTERBLOCK_ARGS);
-int ompi_coll_tuned_reduce_scatter_block_intra_do_this(REDUCESCATTERBLOCK_ARGS, int algorithm, int faninout, int segsize);
+int ompi_coll_tuned_reduce_scatter_block_intra_do_this(REDUCESCATTERBLOCK_ARGS, int algorithm, int faninout, int segsize, ompi_op_gpu_session_t *session);
 int ompi_coll_tuned_reduce_scatter_block_intra_check_forced_init (coll_tuned_force_algorithm_mca_param_indices_t *mca_param_indices);
 
 /* Scatter */
 int ompi_coll_tuned_scatter_intra_dec_fixed(SCATTER_ARGS);
 int ompi_coll_tuned_scatter_intra_dec_dynamic(SCATTER_ARGS);
-int ompi_coll_tuned_scatter_intra_do_this(SCATTER_ARGS, int algorithm, int faninout, int segsize);
+int ompi_coll_tuned_scatter_intra_do_this(SCATTER_ARGS, int algorithm, int faninout, int segsize, mca_allocator_base_module_t *allocator);
 int ompi_coll_tuned_scatter_intra_check_forced_init (coll_tuned_force_algorithm_mca_param_indices_t *mca_param_indices);
 
 /* Exscan */
 int ompi_coll_tuned_exscan_intra_dec_fixed(EXSCAN_ARGS);
 int ompi_coll_tuned_exscan_intra_dec_dynamic(EXSCAN_ARGS);
-int ompi_coll_tuned_exscan_intra_do_this(EXSCAN_ARGS, int algorithm);
+int ompi_coll_tuned_exscan_intra_do_this(EXSCAN_ARGS, int algorithm, ompi_op_gpu_session_t *session);
 int ompi_coll_tuned_exscan_intra_check_forced_init (coll_tuned_force_algorithm_mca_param_indices_t *mca_param_indices);
 
 /* Scan */
 int ompi_coll_tuned_scan_intra_dec_fixed(SCAN_ARGS);
 int ompi_coll_tuned_scan_intra_dec_dynamic(SCAN_ARGS);
-int ompi_coll_tuned_scan_intra_do_this(SCAN_ARGS, int algorithm);
+int ompi_coll_tuned_scan_intra_do_this(SCAN_ARGS, int algorithm, ompi_op_gpu_session_t *session);
 int ompi_coll_tuned_scan_intra_check_forced_init (coll_tuned_force_algorithm_mca_param_indices_t *mca_param_indices);
 
 struct mca_coll_tuned_component_t {
diff --git a/ompi/mca/coll/tuned/coll_tuned_allgather_decision.c b/ompi/mca/coll/tuned/coll_tuned_allgather_decision.c
index 052c1d5f9e4..4e9d167f79a 100644
--- a/ompi/mca/coll/tuned/coll_tuned_allgather_decision.c
+++ b/ompi/mca/coll/tuned/coll_tuned_allgather_decision.c
@@ -133,7 +133,8 @@ int ompi_coll_tuned_allgather_intra_do_this(const void *sbuf, size_t scount,
                                             struct ompi_datatype_t *rdtype,
                                             struct ompi_communicator_t *comm,
                                             mca_coll_base_module_t *module,
-                                            int algorithm, int faninout, int segsize)
+                                            int algorithm, int faninout, int segsize,
+                                            mca_allocator_base_module_t *allocator)
 {
     OPAL_OUTPUT_VERBOSE((COLL_TUNED_TRACING_VERBOSE, ompi_coll_tuned_stream,
                  "coll:tuned:allgather_intra_do_this selected algorithm %d topo faninout %d segsize %d",
@@ -150,7 +151,7 @@ int ompi_coll_tuned_allgather_intra_do_this(const void *sbuf, size_t scount,
     case (2):
         return ompi_coll_base_allgather_intra_k_bruck(sbuf, scount, sdtype,
                                                       rbuf, rcount, rdtype,
-                                                      comm, module, faninout);
+                                                      comm, module, faninout, allocator);
     case (3):
         return ompi_coll_base_allgather_intra_recursivedoubling(sbuf, scount, sdtype,
                                                                 rbuf, rcount, rdtype,
diff --git a/ompi/mca/coll/tuned/coll_tuned_allreduce_decision.c b/ompi/mca/coll/tuned/coll_tuned_allreduce_decision.c
index 9a63d8c5abb..113779c90b7 100644
--- a/ompi/mca/coll/tuned/coll_tuned_allreduce_decision.c
+++ b/ompi/mca/coll/tuned/coll_tuned_allreduce_decision.c
@@ -130,7 +130,8 @@ int ompi_coll_tuned_allreduce_intra_do_this(const void *sbuf, void *rbuf, size_t
                                             struct ompi_op_t *op,
                                             struct ompi_communicator_t *comm,
                                             mca_coll_base_module_t *module,
-                                            int algorithm, int faninout, int segsize)
+                                            int algorithm, int faninout, int segsize,
+                                            ompi_op_gpu_session_t *session)
 {
     OPAL_OUTPUT_VERBOSE((COLL_TUNED_TRACING_VERBOSE, ompi_coll_tuned_stream,
         "coll:tuned:allreduce_intra_do_this algorithm %d topo fan in/out %d segsize %d",
@@ -144,15 +145,15 @@ int ompi_coll_tuned_allreduce_intra_do_this(const void *sbuf, void *rbuf, size_t
     case (2):
         return ompi_coll_base_allreduce_intra_nonoverlapping(sbuf, rbuf, count, dtype, op, comm, module);
     case (3):
-        return ompi_coll_base_allreduce_intra_recursivedoubling(sbuf, rbuf, count, dtype, op, comm, module);
+        return ompi_coll_base_allreduce_intra_recursivedoubling(sbuf, rbuf, count, dtype, op, comm, module, session);
     case (4):
-        return ompi_coll_base_allreduce_intra_ring(sbuf, rbuf, count, dtype, op, comm, module);
+        return ompi_coll_base_allreduce_intra_ring(sbuf, rbuf, count, dtype, op, comm, module, session);
     case (5):
-        return ompi_coll_base_allreduce_intra_ring_segmented(sbuf, rbuf, count, dtype, op, comm, module, segsize);
+        return ompi_coll_base_allreduce_intra_ring_segmented(sbuf, rbuf, count, dtype, op, comm, module, segsize, session);
     case (6):
-        return ompi_coll_base_allreduce_intra_redscat_allgather(sbuf, rbuf, count, dtype, op, comm, module);
+        return ompi_coll_base_allreduce_intra_redscat_allgather(sbuf, rbuf, count, dtype, op, comm, module, session);
     case (7):
-        return ompi_coll_base_allreduce_intra_allgather_reduce(sbuf, rbuf, count, dtype, op, comm, module);
+        return ompi_coll_base_allreduce_intra_allgather_reduce(sbuf, rbuf, count, dtype, op, comm, module, session);
     } /* switch */
     OPAL_OUTPUT_VERBOSE((COLL_TUNED_TRACING_VERBOSE, ompi_coll_tuned_stream,
         "coll:tuned:allreduce_intra_do_this attempt to select algorithm %d when only 0-%d is valid?",
diff --git a/ompi/mca/coll/tuned/coll_tuned_decision_dynamic.c b/ompi/mca/coll/tuned/coll_tuned_decision_dynamic.c
index 5d6a699b301..17d6786d949 100644
--- a/ompi/mca/coll/tuned/coll_tuned_decision_dynamic.c
+++ b/ompi/mca/coll/tuned/coll_tuned_decision_dynamic.c
@@ -25,7 +25,11 @@
 
 #include "mpi.h"
 #include "ompi/constants.h"
+#include "opal/mca/accelerator/accelerator.h"
+#include "opal/mca/accelerator/base/base.h"
 #include "ompi/datatype/ompi_datatype.h"
+#include "ompi/op/op.h"
+#include "ompi/op/op_gpu_session.h"
 #include "ompi/communicator/communicator.h"
 #include "ompi/mca/coll/base/base.h"
 #include "ompi/mca/coll/coll.h"
@@ -65,10 +69,20 @@ ompi_coll_tuned_allreduce_intra_dec_dynamic (const void *sbuf, void *rbuf, size_
 
     /* Check first if an algorithm is set explicitly for this collective */
     if (tuned_module->user_forced[ALLREDUCE].algorithm) {
-        return ompi_coll_tuned_allreduce_intra_do_this(sbuf, rbuf, count, dtype, op, comm, module,
-                                                       tuned_module->user_forced[ALLREDUCE].algorithm,
-                                                       tuned_module->user_forced[ALLREDUCE].tree_fanout,
-                                                       tuned_module->user_forced[ALLREDUCE].segsize);
+        ompi_op_gpu_session_t *session = NULL;
+        int _dev_id = MCA_ACCELERATOR_NO_DEVICE_ID;
+        uint64_t _flags;
+        if ((sbuf != MPI_IN_PLACE && opal_accelerator.check_addr(sbuf, &_dev_id, &_flags) > 0) ||
+            opal_accelerator.check_addr(rbuf, &_dev_id, &_flags) > 0) {
+            session = ompi_op_gpu_session_begin(op, dtype, _dev_id);
+        }
+        int rc = ompi_coll_tuned_allreduce_intra_do_this(sbuf, rbuf, count, dtype, op, comm, module,
+                                                         tuned_module->user_forced[ALLREDUCE].algorithm,
+                                                         tuned_module->user_forced[ALLREDUCE].tree_fanout,
+                                                         tuned_module->user_forced[ALLREDUCE].segsize,
+                                                         session);
+        ompi_op_gpu_session_end(session);
+        return rc;
     }
 
     /* check to see if we have some filebased rules */
@@ -84,10 +98,18 @@ ompi_coll_tuned_allreduce_intra_dec_dynamic (const void *sbuf, void *rbuf, size_
                                                         dsize, &faninout, &segsize, &ignoreme);
 
         if (alg) {
-            /* we have found a valid choice from the file based rules for this message size */
-            return ompi_coll_tuned_allreduce_intra_do_this (sbuf, rbuf, count, dtype, op,
-                                                            comm, module,
-                                                            alg, faninout, segsize);
+            ompi_op_gpu_session_t *session = NULL;
+            int _dev_id = MCA_ACCELERATOR_NO_DEVICE_ID;
+            uint64_t _flags;
+            if ((sbuf != MPI_IN_PLACE && opal_accelerator.check_addr(sbuf, &_dev_id, &_flags) > 0) ||
+                opal_accelerator.check_addr(rbuf, &_dev_id, &_flags) > 0) {
+                session = ompi_op_gpu_session_begin(op, dtype, _dev_id);
+            }
+            int rc = ompi_coll_tuned_allreduce_intra_do_this(sbuf, rbuf, count, dtype, op,
+                                                             comm, module,
+                                                             alg, faninout, segsize, session);
+            ompi_op_gpu_session_end(session);
+            return rc;
         } /* found a method */
     } /*end if any com rules to check */
 
@@ -319,12 +341,22 @@ int ompi_coll_tuned_reduce_intra_dec_dynamic( const void *sbuf, void *rbuf,
 
     /* Check first if an algorithm is set explicitly for this collective */
     if (tuned_module->user_forced[REDUCE].algorithm) {
-        return ompi_coll_tuned_reduce_intra_do_this(sbuf, rbuf, count, dtype,
-                                                    op, root, comm, module,
-                                                    tuned_module->user_forced[REDUCE].algorithm,
-                                                    tuned_module->user_forced[REDUCE].chain_fanout,
-                                                    tuned_module->user_forced[REDUCE].segsize,
-                                                    tuned_module->user_forced[REDUCE].max_requests);
+        ompi_op_gpu_session_t *session = NULL;
+        int _dev_id = MCA_ACCELERATOR_NO_DEVICE_ID;
+        uint64_t _flags;
+        if ((sbuf != MPI_IN_PLACE && opal_accelerator.check_addr(sbuf, &_dev_id, &_flags) > 0) ||
+            opal_accelerator.check_addr(rbuf, &_dev_id, &_flags) > 0) {
+            session = ompi_op_gpu_session_begin(op, dtype, _dev_id);
+        }
+        int rc = ompi_coll_tuned_reduce_intra_do_this(sbuf, rbuf, count, dtype,
+                                                      op, root, comm, module,
+                                                      tuned_module->user_forced[REDUCE].algorithm,
+                                                      tuned_module->user_forced[REDUCE].chain_fanout,
+                                                      tuned_module->user_forced[REDUCE].segsize,
+                                                      tuned_module->user_forced[REDUCE].max_requests,
+                                                      session);
+        ompi_op_gpu_session_end(session);
+        return rc;
     }
 
     /* check to see if we have some filebased rules */
@@ -341,11 +373,19 @@ int ompi_coll_tuned_reduce_intra_dec_dynamic( const void *sbuf, void *rbuf,
                                                         dsize, &faninout, &segsize, &max_requests);
 
         if (alg) {
-            /* we have found a valid choice from the file based rules for this message size */
-            return  ompi_coll_tuned_reduce_intra_do_this (sbuf, rbuf, count, dtype,
+            ompi_op_gpu_session_t *session = NULL;
+            int _dev_id = MCA_ACCELERATOR_NO_DEVICE_ID;
+            uint64_t _flags;
+            if ((sbuf != MPI_IN_PLACE && opal_accelerator.check_addr(sbuf, &_dev_id, &_flags) > 0) ||
+                opal_accelerator.check_addr(rbuf, &_dev_id, &_flags) > 0) {
+                session = ompi_op_gpu_session_begin(op, dtype, _dev_id);
+            }
+            int rc = ompi_coll_tuned_reduce_intra_do_this(sbuf, rbuf, count, dtype,
                                                           op, root, comm, module,
                                                           alg, faninout,
-                                                          segsize, max_requests);
+                                                          segsize, max_requests, session);
+            ompi_op_gpu_session_end(session);
+            return rc;
         } /* found a method */
     } /*end if any com rules to check */
 
@@ -376,11 +416,21 @@ int ompi_coll_tuned_reduce_scatter_intra_dec_dynamic(const void *sbuf, void *rbu
 
     /* Check first if an algorithm is set explicitly for this collective */
     if (tuned_module->user_forced[REDUCESCATTER].algorithm) {
-        return ompi_coll_tuned_reduce_scatter_intra_do_this(sbuf, rbuf, rcounts, dtype,
-                                                            op, comm, module,
-                                                            tuned_module->user_forced[REDUCESCATTER].algorithm,
-                                                            tuned_module->user_forced[REDUCESCATTER].chain_fanout,
-                                                            tuned_module->user_forced[REDUCESCATTER].segsize);
+        ompi_op_gpu_session_t *session = NULL;
+        int _dev_id = MCA_ACCELERATOR_NO_DEVICE_ID;
+        uint64_t _flags;
+        if ((sbuf != MPI_IN_PLACE && opal_accelerator.check_addr(sbuf, &_dev_id, &_flags) > 0) ||
+            opal_accelerator.check_addr(rbuf, &_dev_id, &_flags) > 0) {
+            session = ompi_op_gpu_session_begin(op, dtype, _dev_id);
+        }
+        int rc = ompi_coll_tuned_reduce_scatter_intra_do_this(sbuf, rbuf, rcounts, dtype,
+                                                              op, comm, module,
+                                                              tuned_module->user_forced[REDUCESCATTER].algorithm,
+                                                              tuned_module->user_forced[REDUCESCATTER].chain_fanout,
+                                                              tuned_module->user_forced[REDUCESCATTER].segsize,
+                                                              session);
+        ompi_op_gpu_session_end(session);
+        return rc;
     }
 
     /* check to see if we have some filebased rules */
@@ -398,10 +448,18 @@ int ompi_coll_tuned_reduce_scatter_intra_dec_dynamic(const void *sbuf, void *rbu
                                                         dsize, &faninout,
                                                         &segsize, &ignoreme);
         if (alg) {
-            /* we have found a valid choice from the file based rules for this message size */
-            return  ompi_coll_tuned_reduce_scatter_intra_do_this (sbuf, rbuf, rcounts, dtype,
+            ompi_op_gpu_session_t *session = NULL;
+            int _dev_id = MCA_ACCELERATOR_NO_DEVICE_ID;
+            uint64_t _flags;
+            if ((sbuf != MPI_IN_PLACE && opal_accelerator.check_addr(sbuf, &_dev_id, &_flags) > 0) ||
+                opal_accelerator.check_addr(rbuf, &_dev_id, &_flags) > 0) {
+                session = ompi_op_gpu_session_begin(op, dtype, _dev_id);
+            }
+            int rc = ompi_coll_tuned_reduce_scatter_intra_do_this(sbuf, rbuf, rcounts, dtype,
                                                                   op, comm, module,
-                                                                  alg, faninout, segsize);
+                                                                  alg, faninout, segsize, session);
+            ompi_op_gpu_session_end(session);
+            return rc;
         } /* found a method */
     } /*end if any com rules to check */
 
@@ -432,11 +490,21 @@ int ompi_coll_tuned_reduce_scatter_block_intra_dec_dynamic(const void *sbuf, voi
 
     /* Check first if an algorithm is set explicitly for this collective */
     if (tuned_module->user_forced[REDUCESCATTERBLOCK].algorithm) {
-        return ompi_coll_tuned_reduce_scatter_block_intra_do_this(sbuf, rbuf, rcount, dtype,
-                                                                  op, comm, module,
-                                                                  tuned_module->user_forced[REDUCESCATTERBLOCK].algorithm,
-                                                                  tuned_module->user_forced[REDUCESCATTERBLOCK].chain_fanout,
-                                                                  tuned_module->user_forced[REDUCESCATTERBLOCK].segsize);
+        ompi_op_gpu_session_t *session = NULL;
+        int _dev_id = MCA_ACCELERATOR_NO_DEVICE_ID;
+        uint64_t _flags;
+        if ((sbuf != MPI_IN_PLACE && opal_accelerator.check_addr(sbuf, &_dev_id, &_flags) > 0) ||
+            opal_accelerator.check_addr(rbuf, &_dev_id, &_flags) > 0) {
+            session = ompi_op_gpu_session_begin(op, dtype, _dev_id);
+        }
+        int rc = ompi_coll_tuned_reduce_scatter_block_intra_do_this(sbuf, rbuf, rcount, dtype,
+                                                                    op, comm, module,
+                                                                    tuned_module->user_forced[REDUCESCATTERBLOCK].algorithm,
+                                                                    tuned_module->user_forced[REDUCESCATTERBLOCK].chain_fanout,
+                                                                    tuned_module->user_forced[REDUCESCATTERBLOCK].segsize,
+                                                                    session);
+        ompi_op_gpu_session_end(session);
+        return rc;
     }
 
     /* check to see if we have some filebased rules */
@@ -453,10 +521,18 @@ int ompi_coll_tuned_reduce_scatter_block_intra_dec_dynamic(const void *sbuf, voi
                                                        dsize, &faninout,
                                                        &segsize, &ignoreme);
         if (alg) {
-            /* we have found a valid choice from the file based rules for this message size */
-            return  ompi_coll_tuned_reduce_scatter_block_intra_do_this (sbuf, rbuf, rcount, dtype,
+            ompi_op_gpu_session_t *session = NULL;
+            int _dev_id = MCA_ACCELERATOR_NO_DEVICE_ID;
+            uint64_t _flags;
+            if ((sbuf != MPI_IN_PLACE && opal_accelerator.check_addr(sbuf, &_dev_id, &_flags) > 0) ||
+                opal_accelerator.check_addr(rbuf, &_dev_id, &_flags) > 0) {
+                session = ompi_op_gpu_session_begin(op, dtype, _dev_id);
+            }
+            int rc = ompi_coll_tuned_reduce_scatter_block_intra_do_this(sbuf, rbuf, rcount, dtype,
                                                                         op, comm, module,
-                                                                        alg, faninout, segsize);
+                                                                        alg, faninout, segsize, session);
+            ompi_op_gpu_session_end(session);
+            return rc;
         } /* found a method */
     } /* end if any com rules to check */
 
@@ -487,13 +563,21 @@ int ompi_coll_tuned_allgather_intra_dec_dynamic(const void *sbuf, size_t scount,
 
     /* Check first if an algorithm is set explicitly for this collective */
     if (tuned_module->user_forced[ALLGATHER].algorithm) {
-        /* User-forced algorithm */
+        mca_allocator_base_module_t *allocator = NULL;
+        int _dev_id = MCA_ACCELERATOR_NO_DEVICE_ID;
+        uint64_t _flags;
+        if ((sbuf != MPI_IN_PLACE &&
+             opal_accelerator.check_addr(sbuf, &_dev_id, &_flags) > 0) ||
+            opal_accelerator.check_addr(rbuf, &_dev_id, &_flags) > 0) {
+            allocator = opal_accelerator_base_get_device_allocator(_dev_id);
+        }
         return ompi_coll_tuned_allgather_intra_do_this(sbuf, scount, sdtype,
                                                        rbuf, rcount, rdtype,
                                                        comm, module,
                                                        tuned_module->user_forced[ALLGATHER].algorithm,
                                                        tuned_module->user_forced[ALLGATHER].tree_fanout,
-                                                       tuned_module->user_forced[ALLGATHER].segsize);
+                                                       tuned_module->user_forced[ALLGATHER].segsize,
+                                                       allocator);
     }
 
     if (tuned_module->com_rules[ALLGATHER]) {
@@ -510,12 +594,18 @@ int ompi_coll_tuned_allgather_intra_dec_dynamic(const void *sbuf, size_t scount,
         alg = ompi_coll_tuned_get_target_method_params (tuned_module->com_rules[ALLGATHER],
                                                         dsize, &faninout, &segsize, &ignoreme);
         if (alg) {
-            /* we have found a valid choice from the file based rules for
-               this message size */
-            return ompi_coll_tuned_allgather_intra_do_this (sbuf, scount, sdtype,
-                                                            rbuf, rcount, rdtype,
-                                                            comm, module,
-                                                            alg, faninout, segsize);
+            mca_allocator_base_module_t *allocator = NULL;
+            int _dev_id = MCA_ACCELERATOR_NO_DEVICE_ID;
+            uint64_t _flags;
+            if ((sbuf != MPI_IN_PLACE &&
+                 opal_accelerator.check_addr(sbuf, &_dev_id, &_flags) > 0) ||
+                opal_accelerator.check_addr(rbuf, &_dev_id, &_flags) > 0) {
+                allocator = opal_accelerator_base_get_device_allocator(_dev_id);
+            }
+            return ompi_coll_tuned_allgather_intra_do_this(sbuf, scount, sdtype,
+                                                           rbuf, rcount, rdtype,
+                                                           comm, module,
+                                                           alg, faninout, segsize, allocator);
         }
     }
 
@@ -604,6 +694,20 @@ int ompi_coll_tuned_gather_intra_dec_dynamic(const void *sbuf, size_t scount,
     OPAL_OUTPUT_VERBOSE((COLL_TUNED_TRACING_VERBOSE, ompi_coll_tuned_stream,
                  "ompi_coll_tuned_gather_intra_dec_dynamic"));
 
+    mca_allocator_base_module_t *allocator = NULL;
+
+    /* Scratch buffer is used for data movement only (no ompi_op_reduce).
+     * Use device allocator when user buffers are on device. */
+    {
+        int _dev_id = MCA_ACCELERATOR_NO_DEVICE_ID;
+        uint64_t _flags;
+        if ((sbuf != MPI_IN_PLACE &&
+             opal_accelerator.check_addr(sbuf, &_dev_id, &_flags) > 0) ||
+            opal_accelerator.check_addr(rbuf, &_dev_id, &_flags) > 0) {
+            allocator = opal_accelerator_base_get_device_allocator(_dev_id);
+        }
+    }
+
     /* Check first if an algorithm is set explicitly for this collective */
     if (tuned_module->user_forced[GATHER].algorithm) {
         return ompi_coll_tuned_gather_intra_do_this(sbuf, scount, sdtype,
@@ -611,7 +715,8 @@ int ompi_coll_tuned_gather_intra_dec_dynamic(const void *sbuf, size_t scount,
                                                     root, comm, module,
                                                     tuned_module->user_forced[GATHER].algorithm,
                                                     tuned_module->user_forced[GATHER].tree_fanout,
-                                                    tuned_module->user_forced[GATHER].segsize);
+                                                    tuned_module->user_forced[GATHER].segsize,
+                                                    allocator);
     }
 
     /**
@@ -629,11 +734,10 @@ int ompi_coll_tuned_gather_intra_dec_dynamic(const void *sbuf, size_t scount,
                                                         dsize, &faninout, &segsize, &max_requests);
 
         if (alg) {
-            /* we have found a valid choice from the file based rules for this message size */
-            return ompi_coll_tuned_gather_intra_do_this (sbuf, scount, sdtype,
-                                                         rbuf, rcount, rdtype,
-                                                         root, comm, module,
-                                                         alg, faninout, segsize);
+            return ompi_coll_tuned_gather_intra_do_this(sbuf, scount, sdtype,
+                                                        rbuf, rcount, rdtype,
+                                                        root, comm, module,
+                                                        alg, faninout, segsize, allocator);
         } /* found a method */
     } /*end if any com rules to check */
 
@@ -654,6 +758,20 @@ int ompi_coll_tuned_scatter_intra_dec_dynamic(const void *sbuf, size_t scount,
     OPAL_OUTPUT_VERBOSE((COLL_TUNED_TRACING_VERBOSE, ompi_coll_tuned_stream,
                  "ompi_coll_tuned_scatter_intra_dec_dynamic"));
 
+    mca_allocator_base_module_t *allocator = NULL;
+
+    /* Scratch buffer is used for data movement only (no ompi_op_reduce).
+     * Use device allocator when user buffers are on device. */
+    {
+        int _dev_id = MCA_ACCELERATOR_NO_DEVICE_ID;
+        uint64_t _flags;
+        if ((sbuf != MPI_IN_PLACE &&
+             opal_accelerator.check_addr(sbuf, &_dev_id, &_flags) > 0) ||
+            opal_accelerator.check_addr(rbuf, &_dev_id, &_flags) > 0) {
+            allocator = opal_accelerator_base_get_device_allocator(_dev_id);
+        }
+    }
+
     /* Check first if an algorithm is set explicitly for this collective */
     if (tuned_module->user_forced[SCATTER].algorithm) {
         return ompi_coll_tuned_scatter_intra_do_this(sbuf, scount, sdtype,
@@ -661,7 +779,8 @@ int ompi_coll_tuned_scatter_intra_dec_dynamic(const void *sbuf, size_t scount,
                                                      root, comm, module,
                                                      tuned_module->user_forced[SCATTER].algorithm,
                                                      tuned_module->user_forced[SCATTER].chain_fanout,
-                                                     tuned_module->user_forced[SCATTER].segsize);
+                                                     tuned_module->user_forced[SCATTER].segsize,
+                                                     allocator);
     }
 
     /**
@@ -679,11 +798,10 @@ int ompi_coll_tuned_scatter_intra_dec_dynamic(const void *sbuf, size_t scount,
                                                         dsize, &faninout, &segsize, &max_requests);
 
         if (alg) {
-            /* we have found a valid choice from the file based rules for this message size */
-            return ompi_coll_tuned_scatter_intra_do_this (sbuf, scount, sdtype,
-                                                          rbuf, rcount, rdtype,
-                                                          root, comm, module,
-                                                          alg, faninout, segsize);
+            return ompi_coll_tuned_scatter_intra_do_this(sbuf, scount, sdtype,
+                                                         rbuf, rcount, rdtype,
+                                                         root, comm, module,
+                                                         alg, faninout, segsize, allocator);
         } /* found a method */
     } /*end if any com rules to check */
 
@@ -705,9 +823,19 @@ int ompi_coll_tuned_exscan_intra_dec_dynamic(const void *sbuf, void* rbuf, size_
 
     /* Check first if an algorithm is set explicitly for this collective */
     if (tuned_module->user_forced[EXSCAN].algorithm) {
-        return ompi_coll_tuned_exscan_intra_do_this(sbuf, rbuf, count, dtype,
-                                                    op, comm, module,
-                                                    tuned_module->user_forced[EXSCAN].algorithm);
+        ompi_op_gpu_session_t *session = NULL;
+        int _dev_id = MCA_ACCELERATOR_NO_DEVICE_ID;
+        uint64_t _flags;
+        if ((sbuf != MPI_IN_PLACE && opal_accelerator.check_addr(sbuf, &_dev_id, &_flags) > 0) ||
+            opal_accelerator.check_addr(rbuf, &_dev_id, &_flags) > 0) {
+            session = ompi_op_gpu_session_begin(op, dtype, _dev_id);
+        }
+        int rc = ompi_coll_tuned_exscan_intra_do_this(sbuf, rbuf, count, dtype,
+                                                      op, comm, module,
+                                                      tuned_module->user_forced[EXSCAN].algorithm,
+                                                      session);
+        ompi_op_gpu_session_end(session);
+        return rc;
     }
 
     /**
@@ -725,10 +853,18 @@ int ompi_coll_tuned_exscan_intra_dec_dynamic(const void *sbuf, void* rbuf, size_
                                                         dsize, &faninout, &segsize, &max_requests);
 
         if (alg) {
-            /* we have found a valid choice from the file based rules for this message size */
-            return ompi_coll_tuned_exscan_intra_do_this (sbuf, rbuf, count, dtype,
-                                                         op, comm, module,
-                                                         alg);
+            ompi_op_gpu_session_t *session = NULL;
+            int _dev_id = MCA_ACCELERATOR_NO_DEVICE_ID;
+            uint64_t _flags;
+            if ((sbuf != MPI_IN_PLACE && opal_accelerator.check_addr(sbuf, &_dev_id, &_flags) > 0) ||
+                opal_accelerator.check_addr(rbuf, &_dev_id, &_flags) > 0) {
+                session = ompi_op_gpu_session_begin(op, dtype, _dev_id);
+            }
+            int rc = ompi_coll_tuned_exscan_intra_do_this(sbuf, rbuf, count, dtype,
+                                                          op, comm, module,
+                                                          alg, session);
+            ompi_op_gpu_session_end(session);
+            return rc;
         } /* found a method */
     } /*end if any com rules to check */
 
@@ -749,9 +885,19 @@ int ompi_coll_tuned_scan_intra_dec_dynamic(const void *sbuf, void* rbuf, size_t
 
     /* Check first if an algorithm is set explicitly for this collective */
     if (tuned_module->user_forced[SCAN].algorithm) {
-        return ompi_coll_tuned_scan_intra_do_this(sbuf, rbuf, count, dtype,
-                                                  op, comm, module,
-                                                  tuned_module->user_forced[SCAN].algorithm);
+        ompi_op_gpu_session_t *session = NULL;
+        int _dev_id = MCA_ACCELERATOR_NO_DEVICE_ID;
+        uint64_t _flags;
+        if ((sbuf != MPI_IN_PLACE && opal_accelerator.check_addr(sbuf, &_dev_id, &_flags) > 0) ||
+            opal_accelerator.check_addr(rbuf, &_dev_id, &_flags) > 0) {
+            session = ompi_op_gpu_session_begin(op, dtype, _dev_id);
+        }
+        int rc = ompi_coll_tuned_scan_intra_do_this(sbuf, rbuf, count, dtype,
+                                                    op, comm, module,
+                                                    tuned_module->user_forced[SCAN].algorithm,
+                                                    session);
+        ompi_op_gpu_session_end(session);
+        return rc;
     }
 
     /**
@@ -769,10 +915,18 @@ int ompi_coll_tuned_scan_intra_dec_dynamic(const void *sbuf, void* rbuf, size_t
                                                         dsize, &faninout, &segsize, &max_requests);
 
         if (alg) {
-            /* we have found a valid choice from the file based rules for this message size */
-            return ompi_coll_tuned_scan_intra_do_this (sbuf, rbuf, count, dtype,
-                                                       op, comm, module,
-                                                       alg);
+            ompi_op_gpu_session_t *session = NULL;
+            int _dev_id = MCA_ACCELERATOR_NO_DEVICE_ID;
+            uint64_t _flags;
+            if ((sbuf != MPI_IN_PLACE && opal_accelerator.check_addr(sbuf, &_dev_id, &_flags) > 0) ||
+                opal_accelerator.check_addr(rbuf, &_dev_id, &_flags) > 0) {
+                session = ompi_op_gpu_session_begin(op, dtype, _dev_id);
+            }
+            int rc = ompi_coll_tuned_scan_intra_do_this(sbuf, rbuf, count, dtype,
+                                                        op, comm, module,
+                                                        alg, session);
+            ompi_op_gpu_session_end(session);
+            return rc;
         } /* found a method */
     } /*end if any com rules to check */
 
diff --git a/ompi/mca/coll/tuned/coll_tuned_decision_fixed.c b/ompi/mca/coll/tuned/coll_tuned_decision_fixed.c
index 3b0077c9bcc..c8299c68a66 100644
--- a/ompi/mca/coll/tuned/coll_tuned_decision_fixed.c
+++ b/ompi/mca/coll/tuned/coll_tuned_decision_fixed.c
@@ -29,11 +29,14 @@
 
 #include "mpi.h"
 #include "opal/util/bit_ops.h"
+#include "opal/mca/accelerator/accelerator.h"
+#include "opal/mca/accelerator/base/base.h"
 #include "ompi/datatype/ompi_datatype.h"
 #include "ompi/communicator/communicator.h"
 #include "ompi/mca/coll/coll.h"
 #include "ompi/mca/coll/base/coll_tags.h"
 #include "ompi/op/op.h"
+#include "ompi/op/op_gpu_session.h"
 #include "coll_tuned.h"
 
 /*
@@ -214,8 +217,19 @@ ompi_coll_tuned_allreduce_intra_dec_fixed(const void *sbuf, void *rbuf, size_t c
         }
     }
 
-    return ompi_coll_tuned_allreduce_intra_do_this (sbuf, rbuf, count, dtype, op,
-                                                    comm, module, alg, 0, 0);
+    {
+        ompi_op_gpu_session_t *session = NULL;
+        int _dev_id = MCA_ACCELERATOR_NO_DEVICE_ID;
+        uint64_t _flags;
+        if ((sbuf != MPI_IN_PLACE && opal_accelerator.check_addr(sbuf, &_dev_id, &_flags) > 0) ||
+            opal_accelerator.check_addr(rbuf, &_dev_id, &_flags) > 0) {
+            session = ompi_op_gpu_session_begin(op, dtype, _dev_id);
+        }
+        int rc = ompi_coll_tuned_allreduce_intra_do_this(sbuf, rbuf, count, dtype, op,
+                                                         comm, module, alg, 0, 0, session);
+        ompi_op_gpu_session_end(session);
+        return rc;
+    }
 }
 
 
@@ -402,7 +416,7 @@ ompi_coll_tuned_allreduce_intra_disjoint_dec_fixed(const void *sbuf, void *rbuf,
     }
 
     return ompi_coll_tuned_allreduce_intra_do_this (sbuf, rbuf, count, dtype, op,
-                                                    comm, module, alg, 0, 0);
+                                                    comm, module, alg, 0, 0, NULL);
 }
 
                            
@@ -1073,10 +1087,21 @@ int ompi_coll_tuned_reduce_intra_dec_fixed( const void *sendbuf, void *recvbuf,
         }
     }
 
-    int faninout = 2;
-    return  ompi_coll_tuned_reduce_intra_do_this (sendbuf, recvbuf, count, datatype,
-                                                  op, root, comm, module,
-                                                  alg, faninout, 0, 0);
+    {
+        ompi_op_gpu_session_t *session = NULL;
+        int _dev_id = MCA_ACCELERATOR_NO_DEVICE_ID;
+        uint64_t _flags;
+        if ((sendbuf != MPI_IN_PLACE && opal_accelerator.check_addr(sendbuf, &_dev_id, &_flags) > 0) ||
+            opal_accelerator.check_addr(recvbuf, &_dev_id, &_flags) > 0) {
+            session = ompi_op_gpu_session_begin(op, datatype, _dev_id);
+        }
+        int faninout = 2;
+        int rc = ompi_coll_tuned_reduce_intra_do_this(sendbuf, recvbuf, count, datatype,
+                                                      op, root, comm, module,
+                                                      alg, faninout, 0, 0, session);
+        ompi_op_gpu_session_end(session);
+        return rc;
+    }
 }
 
 /*
@@ -1223,9 +1248,20 @@ int ompi_coll_tuned_reduce_scatter_intra_dec_fixed( const void *sbuf, void *rbuf
         }
     }
 
-    return  ompi_coll_tuned_reduce_scatter_intra_do_this (sbuf, rbuf, rcounts, dtype,
-                                                          op, comm, module,
-                                                          alg, 0, 0);
+    {
+        ompi_op_gpu_session_t *session = NULL;
+        int _dev_id = MCA_ACCELERATOR_NO_DEVICE_ID;
+        uint64_t _flags;
+        if ((sbuf != MPI_IN_PLACE && opal_accelerator.check_addr(sbuf, &_dev_id, &_flags) > 0) ||
+            opal_accelerator.check_addr(rbuf, &_dev_id, &_flags) > 0) {
+            session = ompi_op_gpu_session_begin(op, dtype, _dev_id);
+        }
+        int rc = ompi_coll_tuned_reduce_scatter_intra_do_this(sbuf, rbuf, rcounts, dtype,
+                                                              op, comm, module,
+                                                              alg, 0, 0, session);
+        ompi_op_gpu_session_end(session);
+        return rc;
+    }
 }
 
 /*
@@ -1344,9 +1380,20 @@ int ompi_coll_tuned_reduce_scatter_block_intra_dec_fixed(const void *sbuf, void
         }
     }
 
-    return  ompi_coll_tuned_reduce_scatter_block_intra_do_this (sbuf, rbuf, rcount, dtype,
-                                                                op, comm, module,
-                                                                alg, 0, 0);
+    {
+        ompi_op_gpu_session_t *session = NULL;
+        int _dev_id = MCA_ACCELERATOR_NO_DEVICE_ID;
+        uint64_t _flags;
+        if ((sbuf != MPI_IN_PLACE && opal_accelerator.check_addr(sbuf, &_dev_id, &_flags) > 0) ||
+            opal_accelerator.check_addr(rbuf, &_dev_id, &_flags) > 0) {
+            session = ompi_op_gpu_session_begin(op, dtype, _dev_id);
+        }
+        int rc = ompi_coll_tuned_reduce_scatter_block_intra_do_this(sbuf, rbuf, rcount, dtype,
+                                                                    op, comm, module,
+                                                                    alg, 0, 0, session);
+        ompi_op_gpu_session_end(session);
+        return rc;
+    }
 }
 
 /*
@@ -1491,10 +1538,21 @@ int ompi_coll_tuned_allgather_intra_dec_fixed(const void *sbuf, size_t scount,
         "ompi_coll_tuned_allgather_intra_dec_fixed rank %d com_size %d",
         ompi_comm_rank(comm), communicator_size));
 
-    int faninout = 2;
-    return ompi_coll_tuned_allgather_intra_do_this(sbuf, scount, sdtype,
-                                                   rbuf, rcount, rdtype,
-                                                   comm, module, alg, faninout, 0);
+    {
+        mca_allocator_base_module_t *allocator = NULL;
+        int _dev_id = MCA_ACCELERATOR_NO_DEVICE_ID;
+        uint64_t _flags;
+        if ((sbuf != MPI_IN_PLACE &&
+             opal_accelerator.check_addr(sbuf, &_dev_id, &_flags) > 0) ||
+            opal_accelerator.check_addr(rbuf, &_dev_id, &_flags) > 0) {
+            allocator = opal_accelerator_base_get_device_allocator(_dev_id);
+        }
+        int faninout = 2;
+        return ompi_coll_tuned_allgather_intra_do_this(sbuf, scount, sdtype,
+                                                       rbuf, rcount, rdtype,
+                                                       comm, module, alg, faninout, 0,
+                                                       allocator);
+    }
 }
 
 /*
@@ -1720,10 +1778,20 @@ int ompi_coll_tuned_gather_intra_dec_fixed(const void *sbuf, size_t scount,
         alg = 2;
     }
 
-    return ompi_coll_tuned_gather_intra_do_this (sbuf, scount, sdtype,
-                                                 rbuf, rcount, rdtype,
-                                                 root, comm, module,
-                                                 alg, 0, 0);
+    {
+        mca_allocator_base_module_t *allocator = NULL;
+        int _dev_id = MCA_ACCELERATOR_NO_DEVICE_ID;
+        uint64_t _flags;
+        if ((sbuf != MPI_IN_PLACE &&
+             opal_accelerator.check_addr(sbuf, &_dev_id, &_flags) > 0) ||
+            opal_accelerator.check_addr(rbuf, &_dev_id, &_flags) > 0) {
+            allocator = opal_accelerator_base_get_device_allocator(_dev_id);
+        }
+        return ompi_coll_tuned_gather_intra_do_this(sbuf, scount, sdtype,
+                                                    rbuf, rcount, rdtype,
+                                                    root, comm, module,
+                                                    alg, 0, 0, allocator);
+    }
 }
 
 /*
@@ -1825,8 +1893,18 @@ int ompi_coll_tuned_scatter_intra_dec_fixed(const void *sbuf, size_t scount,
         }
     }
 
-    return ompi_coll_tuned_scatter_intra_do_this (sbuf, scount, sdtype,
-                                                  rbuf, rcount, rdtype,
-                                                  root, comm, module,
-                                                  alg, 0, 0);
+    {
+        mca_allocator_base_module_t *allocator = NULL;
+        int _dev_id = MCA_ACCELERATOR_NO_DEVICE_ID;
+        uint64_t _flags;
+        if ((sbuf != MPI_IN_PLACE &&
+             opal_accelerator.check_addr(sbuf, &_dev_id, &_flags) > 0) ||
+            opal_accelerator.check_addr(rbuf, &_dev_id, &_flags) > 0) {
+            allocator = opal_accelerator_base_get_device_allocator(_dev_id);
+        }
+        return ompi_coll_tuned_scatter_intra_do_this(sbuf, scount, sdtype,
+                                                     rbuf, rcount, rdtype,
+                                                     root, comm, module,
+                                                     alg, 0, 0, allocator);
+    }
 }
diff --git a/ompi/mca/coll/tuned/coll_tuned_exscan_decision.c b/ompi/mca/coll/tuned/coll_tuned_exscan_decision.c
index 48288c5d7d7..50f16dc27cd 100644
--- a/ompi/mca/coll/tuned/coll_tuned_exscan_decision.c
+++ b/ompi/mca/coll/tuned/coll_tuned_exscan_decision.c
@@ -93,7 +93,7 @@ int ompi_coll_tuned_exscan_intra_do_this(const void *sbuf, void* rbuf, size_t co
                                          struct ompi_op_t *op,
                                          struct ompi_communicator_t *comm,
                                          mca_coll_base_module_t *module,
-                                         int algorithm)
+                                         int algorithm, ompi_op_gpu_session_t *session)
 {
     OPAL_OUTPUT_VERBOSE((COLL_TUNED_TRACING_VERBOSE, ompi_coll_tuned_stream,
         "coll:tuned:exscan_intra_do_this selected algorithm %d",
@@ -104,7 +104,7 @@ int ompi_coll_tuned_exscan_intra_do_this(const void *sbuf, void* rbuf, size_t co
     case (1):  return ompi_coll_base_exscan_intra_linear(sbuf, rbuf, count, dtype,
                                                          op, comm, module);
     case (2):  return ompi_coll_base_exscan_intra_recursivedoubling(sbuf, rbuf, count, dtype,
-                                                                    op, comm, module);
+                                                                    op, comm, module, session);
     } /* switch */
     OPAL_OUTPUT_VERBOSE((COLL_TUNED_TRACING_VERBOSE, ompi_coll_tuned_stream,
         "coll:tuned:exscan_intra_do_this attempt to select algorithm %d when only 0-%d is valid?",
diff --git a/ompi/mca/coll/tuned/coll_tuned_gather_decision.c b/ompi/mca/coll/tuned/coll_tuned_gather_decision.c
index d356202a3bf..1845f32a00b 100644
--- a/ompi/mca/coll/tuned/coll_tuned_gather_decision.c
+++ b/ompi/mca/coll/tuned/coll_tuned_gather_decision.c
@@ -130,7 +130,8 @@ ompi_coll_tuned_gather_intra_do_this(const void *sbuf, size_t scount,
                                      int root,
                                      struct ompi_communicator_t *comm,
                                      mca_coll_base_module_t *module,
-                                     int algorithm, int faninout, int segsize)
+                                     int algorithm, int faninout, int segsize,
+                                     mca_allocator_base_module_t *allocator)
 {
     OPAL_OUTPUT_VERBOSE((COLL_TUNED_TRACING_VERBOSE, ompi_coll_tuned_stream,
                  "coll:tuned:gather_intra_do_this selected algorithm %d topo faninout %d segsize %d",
@@ -148,7 +149,7 @@ ompi_coll_tuned_gather_intra_do_this(const void *sbuf, size_t scount,
     case (2):
         return ompi_coll_base_gather_intra_binomial(sbuf, scount, sdtype,
                                                     rbuf, rcount, rdtype,
-                                                    root, comm, module);
+                                                    root, comm, module, allocator);
     case (3):
         return ompi_coll_base_gather_intra_linear_sync(sbuf, scount, sdtype,
                                                        rbuf, rcount, rdtype,
diff --git a/ompi/mca/coll/tuned/coll_tuned_reduce_decision.c b/ompi/mca/coll/tuned/coll_tuned_reduce_decision.c
index 6ae3c00f7d9..f935a680116 100644
--- a/ompi/mca/coll/tuned/coll_tuned_reduce_decision.c
+++ b/ompi/mca/coll/tuned/coll_tuned_reduce_decision.c
@@ -154,7 +154,8 @@ int ompi_coll_tuned_reduce_intra_do_this(const void *sbuf, void* rbuf, size_t co
                                          struct ompi_communicator_t *comm,
                                          mca_coll_base_module_t *module,
                                          int algorithm, int faninout,
-                                         int segsize, int max_requests )
+                                         int segsize, int max_requests,
+                                         ompi_op_gpu_session_t *session)
 {
     OPAL_OUTPUT_VERBOSE((COLL_TUNED_TRACING_VERBOSE, ompi_coll_tuned_stream,
         "coll:tuned:reduce_intra_do_this selected algorithm %d topo faninout %d segsize %d",
@@ -167,25 +168,31 @@ int ompi_coll_tuned_reduce_intra_do_this(const void *sbuf, void* rbuf, size_t co
                                                                op, root, comm, module);
     case (2):  return ompi_coll_base_reduce_intra_chain(sbuf, rbuf, count, dtype,
                                                         op, root, comm, module,
-                                                        segsize, faninout, max_requests);
+                                                        segsize, faninout, max_requests,
+                                                        session);
     case (3):  return ompi_coll_base_reduce_intra_pipeline(sbuf, rbuf, count, dtype,
                                                            op, root, comm, module,
-                                                           segsize, max_requests);
+                                                           segsize, max_requests,
+                                                           session);
     case (4):  return ompi_coll_base_reduce_intra_binary(sbuf, rbuf, count, dtype,
                                                          op, root, comm, module,
-                                                         segsize, max_requests);
+                                                         segsize, max_requests,
+                                                         session);
     case (5):  return ompi_coll_base_reduce_intra_binomial(sbuf, rbuf, count, dtype,
                                                            op, root, comm, module,
-                                                           segsize, max_requests);
+                                                           segsize, max_requests,
+                                                           session);
     case (6):  return ompi_coll_base_reduce_intra_in_order_binary(sbuf, rbuf, count, dtype,
                                                                   op, root, comm, module,
-                                                                  segsize, max_requests);
+                                                                  segsize, max_requests,
+                                                                  session);
     case (7):  return ompi_coll_base_reduce_intra_redscat_gather(sbuf, rbuf, count, dtype,
-                                                                  op, root, comm, module);
+                                                                  op, root, comm, module,
+                                                                  session);
     case (8):  return ompi_coll_base_reduce_intra_knomial(sbuf, rbuf, count, dtype,
                                                           op, root, comm, module,
                                                           segsize, max_requests,
-                                                          faninout);
+                                                          faninout, session);
     } /* switch */
     OPAL_OUTPUT_VERBOSE((COLL_TUNED_TRACING_VERBOSE, ompi_coll_tuned_stream,
         "coll:tuned:reduce_intra_do_this attempt to select algorithm %d when only 0-%d is valid?",
diff --git a/ompi/mca/coll/tuned/coll_tuned_reduce_scatter_block_decision.c b/ompi/mca/coll/tuned/coll_tuned_reduce_scatter_block_decision.c
index f4f6bdb7590..7a05ce9ee72 100644
--- a/ompi/mca/coll/tuned/coll_tuned_reduce_scatter_block_decision.c
+++ b/ompi/mca/coll/tuned/coll_tuned_reduce_scatter_block_decision.c
@@ -123,7 +123,8 @@ int ompi_coll_tuned_reduce_scatter_block_intra_do_this(const void *sbuf, void *r
                                                        struct ompi_op_t *op,
                                                        struct ompi_communicator_t *comm,
                                                        mca_coll_base_module_t *module,
-                                                       int algorithm, int faninout, int segsize)
+                                                       int algorithm, int faninout, int segsize,
+                                                       ompi_op_gpu_session_t *session)
 {
     OPAL_OUTPUT_VERBOSE((COLL_TUNED_TRACING_VERBOSE, ompi_coll_tuned_stream,
         "coll:tuned:reduce_scatter_block_intra_do_this selected algorithm %d topo faninout %d segsize %d",
@@ -133,13 +134,16 @@ int ompi_coll_tuned_reduce_scatter_block_intra_do_this(const void *sbuf, void *r
     case (0): return ompi_coll_tuned_reduce_scatter_block_intra_dec_fixed(sbuf, rbuf, rcount,
                                                                           dtype, op, comm, module);
     case (1): return ompi_coll_base_reduce_scatter_block_basic_linear(sbuf, rbuf, rcount,
-                                                                      dtype, op, comm, module);
+                                                                      dtype, op, comm, module,
+                                                                      session);
     case (2): return ompi_coll_base_reduce_scatter_block_intra_recursivedoubling(sbuf, rbuf, rcount,
-                                                                                 dtype, op, comm, module);
+                                                                                 dtype, op, comm, module,
+                                                                                 session);
     case (3): return ompi_coll_base_reduce_scatter_block_intra_recursivehalving(sbuf, rbuf, rcount,
-                                                                                dtype, op, comm, module);
+                                                                                dtype, op, comm, module,
+                                                                                session);
     case (4): return ompi_coll_base_reduce_scatter_block_intra_butterfly(sbuf, rbuf, rcount, dtype, op, comm,
-                                                                         module);
+                                                                         module, session);
     } /* switch */
     OPAL_OUTPUT_VERBOSE((COLL_TUNED_TRACING_VERBOSE, ompi_coll_tuned_stream,
         "coll:tuned:reduce_scatter_block_intra_do_this attempt to select algorithm %d when only 0-%d is valid?",
diff --git a/ompi/mca/coll/tuned/coll_tuned_reduce_scatter_decision.c b/ompi/mca/coll/tuned/coll_tuned_reduce_scatter_decision.c
index 16747598b6e..5c79333e567 100644
--- a/ompi/mca/coll/tuned/coll_tuned_reduce_scatter_decision.c
+++ b/ompi/mca/coll/tuned/coll_tuned_reduce_scatter_decision.c
@@ -130,7 +130,8 @@ int ompi_coll_tuned_reduce_scatter_intra_do_this(const void *sbuf, void* rbuf,
                                                  struct ompi_op_t *op,
                                                  struct ompi_communicator_t *comm,
                                                  mca_coll_base_module_t *module,
-                                                 int algorithm, int faninout, int segsize)
+                                                 int algorithm, int faninout, int segsize,
+                                                 ompi_op_gpu_session_t *session)
 {
     OPAL_OUTPUT_VERBOSE((COLL_TUNED_TRACING_VERBOSE, ompi_coll_tuned_stream,
         "coll:tuned:reduce_scatter_intra_do_this selected algorithm %d topo faninout %d segsize %d",
@@ -140,13 +141,17 @@ int ompi_coll_tuned_reduce_scatter_intra_do_this(const void *sbuf, void* rbuf,
     case (0): return ompi_coll_tuned_reduce_scatter_intra_dec_fixed(sbuf, rbuf, rcounts,
                                                                     dtype, op, comm, module);
     case (1): return ompi_coll_base_reduce_scatter_intra_nonoverlapping(sbuf, rbuf, rcounts,
-                                                                        dtype, op, comm, module);
+                                                                        dtype, op, comm, module,
+                                                                        session);
     case (2): return ompi_coll_base_reduce_scatter_intra_basic_recursivehalving(sbuf, rbuf, rcounts,
-                                                                                dtype, op, comm, module);
+                                                                                dtype, op, comm, module,
+                                                                                session);
     case (3): return ompi_coll_base_reduce_scatter_intra_ring(sbuf, rbuf, rcounts,
-                                                              dtype, op, comm, module);
+                                                              dtype, op, comm, module,
+                                                              session);
     case (4): return ompi_coll_base_reduce_scatter_intra_butterfly(sbuf, rbuf, rcounts,
-                                                                   dtype, op, comm, module);
+                                                                   dtype, op, comm, module,
+                                                                   session);
     } /* switch */
     OPAL_OUTPUT_VERBOSE((COLL_TUNED_TRACING_VERBOSE, ompi_coll_tuned_stream,
         "coll:tuned:reduce_scatter_intra_do_this attempt to select algorithm %d when only 0-%d is valid?",
diff --git a/ompi/mca/coll/tuned/coll_tuned_scan_decision.c b/ompi/mca/coll/tuned/coll_tuned_scan_decision.c
index 903e76c4694..e16a2376c65 100644
--- a/ompi/mca/coll/tuned/coll_tuned_scan_decision.c
+++ b/ompi/mca/coll/tuned/coll_tuned_scan_decision.c
@@ -93,7 +93,7 @@ int ompi_coll_tuned_scan_intra_do_this(const void *sbuf, void* rbuf, size_t coun
                                          struct ompi_op_t *op,
                                          struct ompi_communicator_t *comm,
                                          mca_coll_base_module_t *module,
-                                         int algorithm)
+                                         int algorithm, ompi_op_gpu_session_t *session)
 {
     OPAL_OUTPUT_VERBOSE((COLL_TUNED_TRACING_VERBOSE, ompi_coll_tuned_stream,
         "coll:tuned:scan_intra_do_this selected algorithm %d",
@@ -104,7 +104,7 @@ int ompi_coll_tuned_scan_intra_do_this(const void *sbuf, void* rbuf, size_t coun
     case (1):  return ompi_coll_base_scan_intra_linear(sbuf, rbuf, count, dtype,
                                                        op, comm, module);
     case (2):  return ompi_coll_base_scan_intra_recursivedoubling(sbuf, rbuf, count, dtype,
-                                                                  op, comm, module);
+                                                                  op, comm, module, session);
     } /* switch */
     OPAL_OUTPUT_VERBOSE((COLL_TUNED_TRACING_VERBOSE, ompi_coll_tuned_stream,
         "coll:tuned:scan_intra_do_this attempt to select algorithm %d when only 0-%d is valid?",
diff --git a/ompi/mca/coll/tuned/coll_tuned_scatter_decision.c b/ompi/mca/coll/tuned/coll_tuned_scatter_decision.c
index b1449b2955c..89a78f6fab6 100644
--- a/ompi/mca/coll/tuned/coll_tuned_scatter_decision.c
+++ b/ompi/mca/coll/tuned/coll_tuned_scatter_decision.c
@@ -163,7 +163,8 @@ ompi_coll_tuned_scatter_intra_do_this(const void *sbuf, size_t scount,
                                       int root,
                                       struct ompi_communicator_t *comm,
                                       mca_coll_base_module_t *module,
-                                      int algorithm, int faninout, int segsize)
+                                      int algorithm, int faninout, int segsize,
+                                      mca_allocator_base_module_t *allocator)
 {
     OPAL_OUTPUT_VERBOSE((COLL_TUNED_TRACING_VERBOSE, ompi_coll_tuned_stream,
                  "coll:tuned:scatter_intra_do_this selected algorithm %d topo faninout %d segsize %d",
@@ -181,7 +182,7 @@ ompi_coll_tuned_scatter_intra_do_this(const void *sbuf, size_t scount,
     case (2):
         return ompi_coll_base_scatter_intra_binomial(sbuf, scount, sdtype,
                                                      rbuf, rcount, rdtype,
-                                                     root, comm, module);
+                                                     root, comm, module, allocator);
     case (3):
         return ompi_coll_base_scatter_intra_linear_nb(sbuf, scount, sdtype,
                                                       rbuf, rcount, rdtype,
diff --git a/ompi/mca/op/base/op_base_frame.c b/ompi/mca/op/base/op_base_frame.c
index 90167300851..37052782b32 100644
--- a/ompi/mca/op/base/op_base_frame.c
+++ b/ompi/mca/op/base/op_base_frame.c
@@ -29,6 +29,7 @@
 #include "ompi/constants.h"
 #include "ompi/mca/op/op.h"
 #include "ompi/mca/op/base/base.h"
+#include "ompi/op/op_gpu_session.h"
 
 
 /*
@@ -59,5 +60,19 @@ OBJ_CLASS_INSTANCE(ompi_op_base_module_t, opal_object_t,
 OBJ_CLASS_INSTANCE(ompi_op_base_module_1_0_0_t, opal_object_t,
                    module_constructor_1_0_0, NULL);
 
-MCA_BASE_FRAMEWORK_DECLARE(ompi, op, NULL, NULL, NULL, NULL,
+static int
+op_base_open(mca_base_open_flag_t flags)
+{
+    ompi_op_gpu_session_pool_init();
+    return OMPI_SUCCESS;
+}
+
+static int
+op_base_close(void)
+{
+    ompi_op_gpu_session_pool_finalize();
+    return OMPI_SUCCESS;
+}
+
+MCA_BASE_FRAMEWORK_DECLARE(ompi, op, NULL, NULL, op_base_open, op_base_close,
                            mca_op_base_static_components, 0);
diff --git a/ompi/mca/op/cuda/Makefile.am b/ompi/mca/op/cuda/Makefile.am
new file mode 100644
index 00000000000..b8d8ee71dd7
--- /dev/null
+++ b/ompi/mca/op/cuda/Makefile.am
@@ -0,0 +1,84 @@
+#
+# Copyright (c) 2025      Amazon.com, Inc. or its affiliates.  All rights
+#                         reserved.
+# $COPYRIGHT$
+#
+# Additional copyrights may follow
+#
+# $HEADER$
+#
+
+# C sources (compiled by the normal C compiler)
+sources = \
+    op_cuda.h \
+    op_cuda_component.c \
+    op_cuda_session.c
+
+# The .cu file is compiled separately by nvcc and linked in as a plain
+# object.  Libtool does not know how to compile CUDA, so we use a custom
+# rule.  The resulting object is appended to LIBADD for both DSO and static
+# builds.
+
+EXTRA_DIST  = op_cuda_kernels.cu
+CLEANFILES  = op_cuda_kernels.o op_cuda_kernels.lo
+
+# Include paths forwarded to nvcc so it can find ompi_config.h and the
+# op/mca headers.
+NVCC_INCLUDES = \
+    -I$(top_srcdir) \
+    -I$(top_builddir) \
+    -I$(top_srcdir)/ompi \
+    -I$(top_builddir)/ompi \
+    -I$(top_builddir)/opal \
+    -I$(top_builddir)/opal/include \
+    -I$(top_srcdir)/ompi/include \
+    -I$(top_srcdir)/opal/include
+
+# Compile the .cu file with nvcc.  Always pass -fPIC so the same object can
+# be used for both static and DSO builds.  -D_Float16=short papers over a
+# GCC extension (_Float16) that NVCC's host-compiler frontend does not support.
+op_cuda_kernels.o: $(srcdir)/op_cuda_kernels.cu \
+                   $(srcdir)/op_cuda.h
+	$(NVCC) $(NVCCFLAGS) $(NVCC_INCLUDES) \
+	    $(op_cuda_CPPFLAGS) \
+	    --compiler-options "$(DEFS) -D_Float16=short -fPIC" \
+	    -c $< -o $@
+
+# Wrap the nvcc output in a libtool object file (.lo) so it can be properly
+# included in noinst_LTLIBRARIES via LIBADD.
+op_cuda_kernels.lo: op_cuda_kernels.o
+	@{ echo '# Generated by libtool (CUDA kernel; PIC via --compiler-options)'; \
+	   echo "pic_object='op_cuda_kernels.o'"; \
+	   echo "non_pic_object='op_cuda_kernels.o'"; \
+	} > $@
+
+AM_CPPFLAGS = $(op_cuda_CPPFLAGS)
+
+# ----------------------------------------------------------------------------
+# DSO build
+# ----------------------------------------------------------------------------
+if MCA_BUILD_ompi_op_cuda_DSO
+component_install = mca_op_cuda.la
+component_noinst  =
+else
+component_install =
+component_noinst  = libmca_op_cuda.la
+endif
+
+mcacomponentdir = $(ompilibdir)
+mcacomponent_LTLIBRARIES = $(component_install)
+
+mca_op_cuda_la_SOURCES  = $(sources)
+mca_op_cuda_la_LDFLAGS  = -module -avoid-version $(op_cuda_LDFLAGS)
+mca_op_cuda_la_LIBADD   = $(op_cuda_LIBS) op_cuda_kernels.lo
+mca_op_cuda_la_CPPFLAGS = $(op_cuda_CPPFLAGS)
+
+# ----------------------------------------------------------------------------
+# Static (convenience library) build
+# ----------------------------------------------------------------------------
+noinst_LTLIBRARIES = $(component_noinst)
+
+libmca_op_cuda_la_SOURCES  = $(sources)
+libmca_op_cuda_la_LDFLAGS  = -module -avoid-version $(op_cuda_LDFLAGS)
+libmca_op_cuda_la_LIBADD   = $(op_cuda_LIBS) op_cuda_kernels.lo
+libmca_op_cuda_la_CPPFLAGS = $(op_cuda_CPPFLAGS)
diff --git a/ompi/mca/op/cuda/configure.m4 b/ompi/mca/op/cuda/configure.m4
new file mode 100644
index 00000000000..71096bca74d
--- /dev/null
+++ b/ompi/mca/op/cuda/configure.m4
@@ -0,0 +1,127 @@
+# -*- shell-script -*-
+#
+# Copyright (c) 2025      Amazon.com, Inc. or its affiliates.  All rights
+#                         reserved.
+# $COPYRIGHT$
+#
+# Additional copyrights may follow
+#
+# $HEADER$
+#
+
+# MCA_ompi_op_cuda_CONFIG([action-if-can-compile],
+#                          [action-if-cant-compile])
+# ------------------------------------------------
+# Build the CUDA persistent-kernel op component when the CUDA runtime
+# toolkit (cuda_runtime.h, libcudart, nvcc) is available.
+#
+# Deliberately does NOT require CUDA_SUPPORT=1 (which gates on libcuda.so,
+# the GPU driver API library).  The op/cuda component only uses the runtime
+# API and can therefore be compiled in build environments that have the CUDA
+# toolkit installed but no GPU driver (e.g., CI containers, cross-build nodes).
+#
+# Requires --with-cuda[=DIR] to locate the toolkit.
+#
+# Sets:
+#   op_cuda_CPPFLAGS — include path for cuda_runtime.h
+#   op_cuda_LDFLAGS  — library search path for libcudart
+#   op_cuda_LIBS     — -lcudart
+#   NVCC             — path to the nvcc compiler
+#   NVCCFLAGS        — default nvcc flags (min arch SM 7.0 for __nanosleep)
+#
+AC_DEFUN([MCA_ompi_op_cuda_CONFIG],[
+    AC_CONFIG_FILES([ompi/mca/op/cuda/Makefile])
+
+    # Ensure with_cuda is defined (OPAL_CHECK_CUDA parses --with-cuda).
+    AC_REQUIRE([OPAL_CHECK_CUDA])
+
+    OPAL_VAR_SCOPE_PUSH([op_cuda_save_CPPFLAGS op_cuda_save_LDFLAGS op_cuda_save_LIBS op_cuda_libdir op_cuda_nvcc_path op_cuda_incdir])
+
+    op_cuda_happy=no
+    op_cuda_incdir=""
+
+    # Only attempt a build when the user asked for CUDA (--with-cuda[=DIR]).
+    AS_IF([test "x$with_cuda" != "x" && test "$with_cuda" != "no"],
+      [
+        # Derive the include directory from $with_cuda, mirroring OPAL_CHECK_CUDA.
+        AS_IF([test -f "${with_cuda}/include/cuda_runtime.h"],
+              [op_cuda_incdir="${with_cuda}/include"],
+              [AS_IF([test -f "${with_cuda}/cuda_runtime.h"],
+                     [op_cuda_incdir="${with_cuda}"],
+                     [AS_IF([test -f "/usr/local/cuda/include/cuda_runtime.h"],
+                            [op_cuda_incdir="/usr/local/cuda/include"])])])
+
+        op_cuda_save_CPPFLAGS="$CPPFLAGS"
+        op_cuda_save_LDFLAGS="$LDFLAGS"
+        op_cuda_save_LIBS="$LIBS"
+
+        AS_IF([test -n "$op_cuda_incdir"],
+              [CPPFLAGS="-I$op_cuda_incdir $CPPFLAGS"])
+
+        # Verify the runtime header is present.
+        AC_CHECK_HEADER([cuda_runtime.h],
+          [op_cuda_happy=yes],
+          [AC_MSG_WARN([cuda_runtime.h not found; skipping op/cuda component])
+           op_cuda_happy=no])
+
+        # Locate libcudart — prefer lib64, fall back to lib, then /usr/local/cuda.
+        AS_IF([test "$op_cuda_happy" = "yes"],
+          [op_cuda_libdir=""
+           AS_IF([test "$with_cuda" != "yes"],
+                 [AS_IF([test -d "$with_cuda/lib64"],
+                        [op_cuda_libdir="$with_cuda/lib64"],
+                        [AS_IF([test -d "$with_cuda/lib"],
+                               [op_cuda_libdir="$with_cuda/lib"])])])
+           AS_IF([test -z "$op_cuda_libdir"],
+                 [AS_IF([test -d "/usr/local/cuda/lib64"],
+                        [op_cuda_libdir="/usr/local/cuda/lib64"],
+                        [AS_IF([test -d "/usr/local/cuda/lib"],
+                               [op_cuda_libdir="/usr/local/cuda/lib"])])])
+           AS_IF([test -n "$op_cuda_libdir"],
+                 [LDFLAGS="-L$op_cuda_libdir $LDFLAGS"])
+           AC_CHECK_LIB([cudart], [cudaGetDeviceCount],
+             [op_cuda_happy=yes],
+             [AC_MSG_WARN([libcudart not found; skipping op/cuda component])
+              op_cuda_happy=no])
+          ])
+
+        # Locate nvcc.
+        AS_IF([test "$op_cuda_happy" = "yes"],
+          [op_cuda_nvcc_path="$PATH"
+           AS_IF([test "$with_cuda" != "yes" && test -d "$with_cuda/bin"],
+                 [op_cuda_nvcc_path="$with_cuda/bin:$PATH"])
+           AC_PATH_PROG([NVCC], [nvcc], [not_found], [$op_cuda_nvcc_path])
+           AS_IF([test "$NVCC" = "not_found"],
+                 [AC_MSG_WARN([nvcc not found; skipping op/cuda component])
+                  op_cuda_happy=no])
+          ])
+
+        # Populate the output variables.
+        AS_IF([test "$op_cuda_happy" = "yes"],
+          [op_cuda_CPPFLAGS="-I$op_cuda_incdir"
+           AS_IF([test -n "$op_cuda_libdir"],
+                 [op_cuda_LDFLAGS="-L$op_cuda_libdir"],
+                 [op_cuda_LDFLAGS=""])
+           op_cuda_LIBS="-lcudart"
+           # __nanosleep requires SM 7.0 (Volta) or later.
+           AS_IF([test "x$NVCCFLAGS" = "x"],
+                 [NVCCFLAGS="-arch=sm_70"])
+          ])
+
+        CPPFLAGS="$op_cuda_save_CPPFLAGS"
+        LDFLAGS="$op_cuda_save_LDFLAGS"
+        LIBS="$op_cuda_save_LIBS"
+      ])
+
+    AC_SUBST([op_cuda_CPPFLAGS])
+    AC_SUBST([op_cuda_LDFLAGS])
+    AC_SUBST([op_cuda_LIBS])
+    AC_SUBST([NVCC])
+    AC_SUBST([NVCCFLAGS])
+
+    OPAL_VAR_SCOPE_POP
+
+    AS_IF([test "$op_cuda_happy" = "yes"],
+          [$1],
+          [$2])
+])dnl
diff --git a/ompi/mca/op/cuda/op_cuda.h b/ompi/mca/op/cuda/op_cuda.h
new file mode 100644
index 00000000000..45930c8351f
--- /dev/null
+++ b/ompi/mca/op/cuda/op_cuda.h
@@ -0,0 +1,66 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
+/*
+ * Copyright (c) 2025      Amazon.com, Inc. or its affiliates.  All rights
+ *                         reserved.
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+
+#ifndef OMPI_MCA_OP_CUDA_H
+#define OMPI_MCA_OP_CUDA_H
+
+#include "ompi_config.h"
+#include <cuda_runtime.h>
+
+#include "ompi/mca/op/op.h"
+#include "ompi/op/op_gpu_session.h"  /* defines ompi_op_gpu_cmd_t */
+
+BEGIN_C_DECLS
+
+/**
+ * CUDA-specific cmd_queue.  Inherits ompi_op_gpu_cmd_queue_t by placing it
+ * as the first member named "super".  The CUDA stream and shutdown flag are
+ * stored directly here rather than in a separate priv allocation.
+ * Allocated with OBJ_NEW; the OBJ destructor chain releases GPU resources.
+ */
+typedef struct ompi_op_cuda_cmd_queue_t {
+    ompi_op_gpu_cmd_queue_t  super;       /* MUST be first */
+    volatile int32_t        *shutdown;    /* managed-memory shutdown flag */
+    cudaStream_t             stream;      /* private CUDA stream */
+} ompi_op_cuda_cmd_queue_t;
+OBJ_CLASS_DECLARATION(ompi_op_cuda_cmd_queue_t);
+
+/**
+ * Host-side launcher function type.
+ * Launches the persistent kernel for one (op, type) combination.
+ */
+typedef void (*ompi_op_cuda_launcher_fn_t)(ompi_op_gpu_cmd_t *cmd,
+                                           volatile int32_t  *shutdown,
+                                           cudaStream_t       stream);
+
+/**
+ * 2D table [op_index][type_index] of launcher functions.
+ * NULL entries indicate unsupported (op, type) combinations; the session
+ * machinery returns NULL for those and the caller falls back to the host path.
+ *
+ * Indexed by OMPI_OP_BASE_FORTRAN_* × OMPI_OP_BASE_TYPE_*.
+ * Defined (and initialized) in op_cuda_kernels.cu.
+ */
+OMPI_DECLSPEC extern ompi_op_cuda_launcher_fn_t
+ompi_op_cuda_kernel_fns[OMPI_OP_BASE_FORTRAN_OP_MAX][OMPI_OP_BASE_TYPE_MAX];
+
+/* Defined in op_cuda_kernels.cu (extern "C") */
+void ompi_op_cuda_kernel_fns_init(void);
+
+/* Defined in op_cuda_session.c */
+ompi_op_gpu_cmd_queue_t *ompi_op_cuda_cmd_queue_alloc(int dev_id);
+ompi_op_gpu_session_t *ompi_op_cuda_session_begin(ompi_op_gpu_cmd_queue_t *queue,
+                                                   struct ompi_op_t *op,
+                                                   struct ompi_datatype_t *dtype);
+
+END_C_DECLS
+
+#endif /* OMPI_MCA_OP_CUDA_H */
diff --git a/ompi/mca/op/cuda/op_cuda_component.c b/ompi/mca/op/cuda/op_cuda_component.c
new file mode 100644
index 00000000000..36638f8a578
--- /dev/null
+++ b/ompi/mca/op/cuda/op_cuda_component.c
@@ -0,0 +1,101 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
+/*
+ * Copyright (c) 2025      Amazon.com, Inc. or its affiliates.  All rights
+ *                         reserved.
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+
+#include "ompi_config.h"
+
+#include <cuda_runtime.h>
+
+#include "ompi/constants.h"
+#include "ompi/op/op.h"
+#include "ompi/mca/op/op.h"
+#include "ompi/mca/op/base/base.h"
+#include "ompi/op/op_gpu_session.h"
+#include "ompi/mca/op/cuda/op_cuda.h"
+
+
+static int cuda_component_open(void);
+static int cuda_component_close(void);
+static int cuda_component_init_query(bool enable_progress_threads,
+                                     bool enable_mpi_thread_multiple);
+static struct ompi_op_base_module_1_0_0_t *
+    cuda_component_op_query(struct ompi_op_t *op, int *priority);
+
+/*
+ * Public component descriptor.
+ *
+ * This component does not provide per-op/per-type function pointers
+ * (opc_op_query returns NULL).  Its sole contribution is the three GPU
+ * hooks that enable persistent-kernel GPU reduction sessions.
+ */
+ompi_op_base_component_1_0_0_t mca_op_cuda_component = {
+    .opc_version = {
+        OMPI_OP_BASE_VERSION_1_0_0,
+
+        .mca_component_name = "cuda",
+        MCA_BASE_MAKE_VERSION(component, OMPI_MAJOR_VERSION, OMPI_MINOR_VERSION,
+                              OMPI_RELEASE_VERSION),
+        .mca_open_component  = cuda_component_open,
+        .mca_close_component = cuda_component_close,
+    },
+    .opc_data = {
+        MCA_BASE_METADATA_PARAM_CHECKPOINT
+    },
+
+    .opc_init_query = cuda_component_init_query,
+    .opc_op_query   = cuda_component_op_query,
+
+    /* GPU session hooks */
+    .opc_cmd_queue_alloc = ompi_op_cuda_cmd_queue_alloc,
+    .opc_session_begin   = ompi_op_cuda_session_begin,
+};
+MCA_BASE_COMPONENT_INIT(ompi, op, cuda)
+
+static int
+cuda_component_open(void)
+{
+    return OMPI_SUCCESS;
+}
+
+static int
+cuda_component_close(void)
+{
+    return OMPI_SUCCESS;
+}
+
+/*
+ * Only activate this component when at least one CUDA-capable device is
+ * present in the system.
+ */
+static int
+cuda_component_init_query(bool enable_progress_threads,
+                          bool enable_mpi_thread_multiple)
+{
+    int device_count = 0;
+    cudaError_t err  = cudaGetDeviceCount(&device_count);
+    if (cudaSuccess != err || device_count <= 0) {
+        return OMPI_ERR_NOT_SUPPORTED;
+    }
+    // register launchers here, component_open seems to be never called
+    ompi_op_cuda_kernel_fns_init();
+    return OMPI_SUCCESS;
+}
+
+/*
+ * We do not provide per-op function pointers, only session hooks, so
+ * always return NULL here.
+ */
+static struct ompi_op_base_module_1_0_0_t *
+cuda_component_op_query(struct ompi_op_t *op, int *priority)
+{
+    (void) op;
+    (void) priority;
+    return NULL;
+}
diff --git a/ompi/mca/op/cuda/op_cuda_kernels.cu b/ompi/mca/op/cuda/op_cuda_kernels.cu
new file mode 100644
index 00000000000..03b1b11bd7a
--- /dev/null
+++ b/ompi/mca/op/cuda/op_cuda_kernels.cu
@@ -0,0 +1,285 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
+/*
+ * Copyright (c) 2025      Amazon.com, Inc. or its affiliates.  All rights
+ *                         reserved.
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+
+/*
+ * Persistent reduction kernels for the CUDA op component.
+ *
+ * Each kernel runs one block of 256 threads and loops indefinitely,
+ * sleeping between polls to reduce power consumption.  The host posts
+ * a command by writing src/dst/count into the managed-memory slot and
+ * then setting status=1.  The kernel executes the reduction, then sets
+ * status=2.  The host spins on status until it sees 2, then resets it
+ * to 0 for the next call.  A separate shutdown flag terminates the loop
+ * at session end.
+ */
+
+#include <stdint.h>
+#include <cuda_runtime.h>
+
+#include "ompi/mca/op/op.h"
+#include "ompi/mca/op/cuda/op_cuda.h"
+
+/* -------------------------------------------------------------------------
+ * PERSISTENT_KERNEL(name, ctype, op_expr)
+ *
+ * Generates __global__ void ompi_op_cuda_persistent_<name>(...).
+ * op_expr must be a statement writing dst[i] from src1[i] and src2[i],
+ * e.g. "dst[i] = src1[i] + src2[i]".  src2 may alias dst for in-place ops.
+ * ------------------------------------------------------------------------- */
+#define PERSISTENT_KERNEL(kname, ctype, op_expr)                               \
+__global__ void ompi_op_cuda_persistent_##kname(                               \
+        ompi_op_gpu_cmd_t *cmd, volatile int32_t *shutdown)                    \
+{                                                                               \
+    while (!*shutdown) {                                                        \
+        /* Spin-wait for work; sleep 1 µs between polls to save power */        \
+        while (cmd->status != 1 && !*shutdown) { __nanosleep(1000); }          \
+        if (*shutdown) break;                                                   \
+        const ctype * __restrict__ src1 = (const ctype *) cmd->src1;           \
+        const ctype * __restrict__ src2 = (const ctype *) cmd->src2;           \
+              ctype * __restrict__ dst  = (      ctype *) cmd->dst;            \
+        int64_t n = cmd->count;                                                 \
+        for (int64_t i = (int64_t)threadIdx.x; i < n; i += blockDim.x) {      \
+            op_expr;                                                            \
+        }                                                                       \
+        __syncthreads();                                                        \
+        if (threadIdx.x == 0) {                                                 \
+            __threadfence_system();   /* ensure dst writes reach host */        \
+            cmd->status = 2;          /* signal done */                         \
+        }                                                                       \
+    }                                                                           \
+}
+
+/* =========================================================================
+ * Kernel instantiations
+ * ========================================================================= */
+
+/* --- MAX --- */
+PERSISTENT_KERNEL(max_int8,   int8_t,   dst[i] = src1[i] > src2[i] ? src1[i] : src2[i])
+PERSISTENT_KERNEL(max_uint8,  uint8_t,  dst[i] = src1[i] > src2[i] ? src1[i] : src2[i])
+PERSISTENT_KERNEL(max_int16,  int16_t,  dst[i] = src1[i] > src2[i] ? src1[i] : src2[i])
+PERSISTENT_KERNEL(max_uint16, uint16_t, dst[i] = src1[i] > src2[i] ? src1[i] : src2[i])
+PERSISTENT_KERNEL(max_int32,  int32_t,  dst[i] = src1[i] > src2[i] ? src1[i] : src2[i])
+PERSISTENT_KERNEL(max_uint32, uint32_t, dst[i] = src1[i] > src2[i] ? src1[i] : src2[i])
+PERSISTENT_KERNEL(max_int64,  int64_t,  dst[i] = src1[i] > src2[i] ? src1[i] : src2[i])
+PERSISTENT_KERNEL(max_uint64, uint64_t, dst[i] = src1[i] > src2[i] ? src1[i] : src2[i])
+PERSISTENT_KERNEL(max_float,  float,    dst[i] = src1[i] > src2[i] ? src1[i] : src2[i])
+PERSISTENT_KERNEL(max_double, double,   dst[i] = src1[i] > src2[i] ? src1[i] : src2[i])
+
+/* --- MIN --- */
+PERSISTENT_KERNEL(min_int8,   int8_t,   dst[i] = src1[i] < src2[i] ? src1[i] : src2[i])
+PERSISTENT_KERNEL(min_uint8,  uint8_t,  dst[i] = src1[i] < src2[i] ? src1[i] : src2[i])
+PERSISTENT_KERNEL(min_int16,  int16_t,  dst[i] = src1[i] < src2[i] ? src1[i] : src2[i])
+PERSISTENT_KERNEL(min_uint16, uint16_t, dst[i] = src1[i] < src2[i] ? src1[i] : src2[i])
+PERSISTENT_KERNEL(min_int32,  int32_t,  dst[i] = src1[i] < src2[i] ? src1[i] : src2[i])
+PERSISTENT_KERNEL(min_uint32, uint32_t, dst[i] = src1[i] < src2[i] ? src1[i] : src2[i])
+PERSISTENT_KERNEL(min_int64,  int64_t,  dst[i] = src1[i] < src2[i] ? src1[i] : src2[i])
+PERSISTENT_KERNEL(min_uint64, uint64_t, dst[i] = src1[i] < src2[i] ? src1[i] : src2[i])
+PERSISTENT_KERNEL(min_float,  float,    dst[i] = src1[i] < src2[i] ? src1[i] : src2[i])
+PERSISTENT_KERNEL(min_double, double,   dst[i] = src1[i] < src2[i] ? src1[i] : src2[i])
+
+/* --- SUM --- */
+PERSISTENT_KERNEL(sum_int8,   int8_t,   dst[i] = src1[i] + src2[i])
+PERSISTENT_KERNEL(sum_uint8,  uint8_t,  dst[i] = src1[i] + src2[i])
+PERSISTENT_KERNEL(sum_int16,  int16_t,  dst[i] = src1[i] + src2[i])
+PERSISTENT_KERNEL(sum_uint16, uint16_t, dst[i] = src1[i] + src2[i])
+PERSISTENT_KERNEL(sum_int32,  int32_t,  dst[i] = src1[i] + src2[i])
+PERSISTENT_KERNEL(sum_uint32, uint32_t, dst[i] = src1[i] + src2[i])
+PERSISTENT_KERNEL(sum_int64,  int64_t,  dst[i] = src1[i] + src2[i])
+PERSISTENT_KERNEL(sum_uint64, uint64_t, dst[i] = src1[i] + src2[i])
+PERSISTENT_KERNEL(sum_float,  float,    dst[i] = src1[i] + src2[i])
+PERSISTENT_KERNEL(sum_double, double,   dst[i] = src1[i] + src2[i])
+
+/* --- PROD --- */
+PERSISTENT_KERNEL(prod_int8,   int8_t,   dst[i] = src1[i] * src2[i])
+PERSISTENT_KERNEL(prod_uint8,  uint8_t,  dst[i] = src1[i] * src2[i])
+PERSISTENT_KERNEL(prod_int16,  int16_t,  dst[i] = src1[i] * src2[i])
+PERSISTENT_KERNEL(prod_uint16, uint16_t, dst[i] = src1[i] * src2[i])
+PERSISTENT_KERNEL(prod_int32,  int32_t,  dst[i] = src1[i] * src2[i])
+PERSISTENT_KERNEL(prod_uint32, uint32_t, dst[i] = src1[i] * src2[i])
+PERSISTENT_KERNEL(prod_int64,  int64_t,  dst[i] = src1[i] * src2[i])
+PERSISTENT_KERNEL(prod_uint64, uint64_t, dst[i] = src1[i] * src2[i])
+PERSISTENT_KERNEL(prod_float,  float,    dst[i] = src1[i] * src2[i])
+PERSISTENT_KERNEL(prod_double, double,   dst[i] = src1[i] * src2[i])
+
+/* --- BAND (bitwise AND, integer types only) --- */
+PERSISTENT_KERNEL(band_int8,   int8_t,   dst[i] = src1[i] & src2[i])
+PERSISTENT_KERNEL(band_uint8,  uint8_t,  dst[i] = src1[i] & src2[i])
+PERSISTENT_KERNEL(band_int16,  int16_t,  dst[i] = src1[i] & src2[i])
+PERSISTENT_KERNEL(band_uint16, uint16_t, dst[i] = src1[i] & src2[i])
+PERSISTENT_KERNEL(band_int32,  int32_t,  dst[i] = src1[i] & src2[i])
+PERSISTENT_KERNEL(band_uint32, uint32_t, dst[i] = src1[i] & src2[i])
+PERSISTENT_KERNEL(band_int64,  int64_t,  dst[i] = src1[i] & src2[i])
+PERSISTENT_KERNEL(band_uint64, uint64_t, dst[i] = src1[i] & src2[i])
+
+/* --- BOR (bitwise OR) --- */
+PERSISTENT_KERNEL(bor_int8,   int8_t,   dst[i] = src1[i] | src2[i])
+PERSISTENT_KERNEL(bor_uint8,  uint8_t,  dst[i] = src1[i] | src2[i])
+PERSISTENT_KERNEL(bor_int16,  int16_t,  dst[i] = src1[i] | src2[i])
+PERSISTENT_KERNEL(bor_uint16, uint16_t, dst[i] = src1[i] | src2[i])
+PERSISTENT_KERNEL(bor_int32,  int32_t,  dst[i] = src1[i] | src2[i])
+PERSISTENT_KERNEL(bor_uint32, uint32_t, dst[i] = src1[i] | src2[i])
+PERSISTENT_KERNEL(bor_int64,  int64_t,  dst[i] = src1[i] | src2[i])
+PERSISTENT_KERNEL(bor_uint64, uint64_t, dst[i] = src1[i] | src2[i])
+
+/* --- BXOR (bitwise XOR) --- */
+PERSISTENT_KERNEL(bxor_int8,   int8_t,   dst[i] = src1[i] ^ src2[i])
+PERSISTENT_KERNEL(bxor_uint8,  uint8_t,  dst[i] = src1[i] ^ src2[i])
+PERSISTENT_KERNEL(bxor_int16,  int16_t,  dst[i] = src1[i] ^ src2[i])
+PERSISTENT_KERNEL(bxor_uint16, uint16_t, dst[i] = src1[i] ^ src2[i])
+PERSISTENT_KERNEL(bxor_int32,  int32_t,  dst[i] = src1[i] ^ src2[i])
+PERSISTENT_KERNEL(bxor_uint32, uint32_t, dst[i] = src1[i] ^ src2[i])
+PERSISTENT_KERNEL(bxor_int64,  int64_t,  dst[i] = src1[i] ^ src2[i])
+PERSISTENT_KERNEL(bxor_uint64, uint64_t, dst[i] = src1[i] ^ src2[i])
+
+/* =========================================================================
+ * Host-side launcher wrappers — one per kernel, 1 block × 256 threads.
+ * ========================================================================= */
+#define LAUNCHER(kname)                                                        \
+static void launch_##kname(ompi_op_gpu_cmd_t *cmd,                            \
+                            volatile int32_t  *sd,                             \
+                            cudaStream_t       stream)                         \
+{                                                                               \
+    ompi_op_cuda_persistent_##kname<<<1, 256, 0, stream>>>(cmd, sd);          \
+}
+
+LAUNCHER(max_int8)    LAUNCHER(max_uint8)
+LAUNCHER(max_int16)   LAUNCHER(max_uint16)
+LAUNCHER(max_int32)   LAUNCHER(max_uint32)
+LAUNCHER(max_int64)   LAUNCHER(max_uint64)
+LAUNCHER(max_float)   LAUNCHER(max_double)
+
+LAUNCHER(min_int8)    LAUNCHER(min_uint8)
+LAUNCHER(min_int16)   LAUNCHER(min_uint16)
+LAUNCHER(min_int32)   LAUNCHER(min_uint32)
+LAUNCHER(min_int64)   LAUNCHER(min_uint64)
+LAUNCHER(min_float)   LAUNCHER(min_double)
+
+LAUNCHER(sum_int8)    LAUNCHER(sum_uint8)
+LAUNCHER(sum_int16)   LAUNCHER(sum_uint16)
+LAUNCHER(sum_int32)   LAUNCHER(sum_uint32)
+LAUNCHER(sum_int64)   LAUNCHER(sum_uint64)
+LAUNCHER(sum_float)   LAUNCHER(sum_double)
+
+LAUNCHER(prod_int8)   LAUNCHER(prod_uint8)
+LAUNCHER(prod_int16)  LAUNCHER(prod_uint16)
+LAUNCHER(prod_int32)  LAUNCHER(prod_uint32)
+LAUNCHER(prod_int64)  LAUNCHER(prod_uint64)
+LAUNCHER(prod_float)  LAUNCHER(prod_double)
+
+LAUNCHER(band_int8)   LAUNCHER(band_uint8)
+LAUNCHER(band_int16)  LAUNCHER(band_uint16)
+LAUNCHER(band_int32)  LAUNCHER(band_uint32)
+LAUNCHER(band_int64)  LAUNCHER(band_uint64)
+
+LAUNCHER(bor_int8)    LAUNCHER(bor_uint8)
+LAUNCHER(bor_int16)   LAUNCHER(bor_uint16)
+LAUNCHER(bor_int32)   LAUNCHER(bor_uint32)
+LAUNCHER(bor_int64)   LAUNCHER(bor_uint64)
+
+LAUNCHER(bxor_int8)   LAUNCHER(bxor_uint8)
+LAUNCHER(bxor_int16)  LAUNCHER(bxor_uint16)
+LAUNCHER(bxor_int32)  LAUNCHER(bxor_uint32)
+LAUNCHER(bxor_int64)  LAUNCHER(bxor_uint64)
+
+/* =========================================================================
+ * 2D launcher table [op_index][type_index]
+ *
+ * Indexed by OMPI_OP_BASE_FORTRAN_* (rows) × OMPI_OP_BASE_TYPE_* (columns).
+ * Zero/NULL entries mean "not supported on GPU" → host fallback.
+ *
+ * Zero-initialized here; filled by ompi_op_cuda_kernel_fns_init() called
+ * from cuda_component_open().  The init function uses plain assignment
+ * instead of designated initializers to stay compatible with NVCC's C++
+ * frontend, which does not support non-trivial designated initializers.
+ * ========================================================================= */
+ompi_op_cuda_launcher_fn_t
+ompi_op_cuda_kernel_fns[OMPI_OP_BASE_FORTRAN_OP_MAX][OMPI_OP_BASE_TYPE_MAX];
+
+void
+ompi_op_cuda_kernel_fns_init(void)
+{
+#define SET(op, type, fn) \
+    ompi_op_cuda_kernel_fns[OMPI_OP_BASE_FORTRAN_##op][OMPI_OP_BASE_TYPE_##type] = (fn)
+
+    SET(MAX, INT8_T,   launch_max_int8);
+    SET(MAX, UINT8_T,  launch_max_uint8);
+    SET(MAX, INT16_T,  launch_max_int16);
+    SET(MAX, UINT16_T, launch_max_uint16);
+    SET(MAX, INT32_T,  launch_max_int32);
+    SET(MAX, UINT32_T, launch_max_uint32);
+    SET(MAX, INT64_T,  launch_max_int64);
+    SET(MAX, UINT64_T, launch_max_uint64);
+    SET(MAX, FLOAT,    launch_max_float);
+    SET(MAX, DOUBLE,   launch_max_double);
+
+    SET(MIN, INT8_T,   launch_min_int8);
+    SET(MIN, UINT8_T,  launch_min_uint8);
+    SET(MIN, INT16_T,  launch_min_int16);
+    SET(MIN, UINT16_T, launch_min_uint16);
+    SET(MIN, INT32_T,  launch_min_int32);
+    SET(MIN, UINT32_T, launch_min_uint32);
+    SET(MIN, INT64_T,  launch_min_int64);
+    SET(MIN, UINT64_T, launch_min_uint64);
+    SET(MIN, FLOAT,    launch_min_float);
+    SET(MIN, DOUBLE,   launch_min_double);
+
+    SET(SUM, INT8_T,   launch_sum_int8);
+    SET(SUM, UINT8_T,  launch_sum_uint8);
+    SET(SUM, INT16_T,  launch_sum_int16);
+    SET(SUM, UINT16_T, launch_sum_uint16);
+    SET(SUM, INT32_T,  launch_sum_int32);
+    SET(SUM, UINT32_T, launch_sum_uint32);
+    SET(SUM, INT64_T,  launch_sum_int64);
+    SET(SUM, UINT64_T, launch_sum_uint64);
+    SET(SUM, FLOAT,    launch_sum_float);
+    SET(SUM, DOUBLE,   launch_sum_double);
+
+    SET(PROD, INT8_T,   launch_prod_int8);
+    SET(PROD, UINT8_T,  launch_prod_uint8);
+    SET(PROD, INT16_T,  launch_prod_int16);
+    SET(PROD, UINT16_T, launch_prod_uint16);
+    SET(PROD, INT32_T,  launch_prod_int32);
+    SET(PROD, UINT32_T, launch_prod_uint32);
+    SET(PROD, INT64_T,  launch_prod_int64);
+    SET(PROD, UINT64_T, launch_prod_uint64);
+    SET(PROD, FLOAT,    launch_prod_float);
+    SET(PROD, DOUBLE,   launch_prod_double);
+
+    SET(BAND, INT8_T,   launch_band_int8);
+    SET(BAND, UINT8_T,  launch_band_uint8);
+    SET(BAND, INT16_T,  launch_band_int16);
+    SET(BAND, UINT16_T, launch_band_uint16);
+    SET(BAND, INT32_T,  launch_band_int32);
+    SET(BAND, UINT32_T, launch_band_uint32);
+    SET(BAND, INT64_T,  launch_band_int64);
+    SET(BAND, UINT64_T, launch_band_uint64);
+
+    SET(BOR, INT8_T,   launch_bor_int8);
+    SET(BOR, UINT8_T,  launch_bor_uint8);
+    SET(BOR, INT16_T,  launch_bor_int16);
+    SET(BOR, UINT16_T, launch_bor_uint16);
+    SET(BOR, INT32_T,  launch_bor_int32);
+    SET(BOR, UINT32_T, launch_bor_uint32);
+    SET(BOR, INT64_T,  launch_bor_int64);
+    SET(BOR, UINT64_T, launch_bor_uint64);
+
+    SET(BXOR, INT8_T,   launch_bxor_int8);
+    SET(BXOR, UINT8_T,  launch_bxor_uint8);
+    SET(BXOR, INT16_T,  launch_bxor_int16);
+    SET(BXOR, UINT16_T, launch_bxor_uint16);
+    SET(BXOR, INT32_T,  launch_bxor_int32);
+    SET(BXOR, UINT32_T, launch_bxor_uint32);
+    SET(BXOR, INT64_T,  launch_bxor_int64);
+    SET(BXOR, UINT64_T, launch_bxor_uint64);
+
+    /* LAND, LOR, LXOR, MAXLOC, MINLOC, REPLACE, NO_OP: NULL → host path */
+#undef SET
+}
diff --git a/ompi/mca/op/cuda/op_cuda_session.c b/ompi/mca/op/cuda/op_cuda_session.c
new file mode 100644
index 00000000000..c51c47307a8
--- /dev/null
+++ b/ompi/mca/op/cuda/op_cuda_session.c
@@ -0,0 +1,244 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
+/*
+ * Copyright (c) 2025      Amazon.com, Inc. or its affiliates.  All rights
+ *                         reserved.
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+
+/*
+ * Session lifecycle for the CUDA persistent-kernel op component.
+ *
+ * ompi_op_cuda_cmd_queue_t inherits ompi_op_gpu_cmd_queue_t.  OBJ_NEW
+ * allocates the object; the OBJ destructor releases the CUDA stream and
+ * managed memory.  The component returns the base pointer from alloc so
+ * callers need no knowledge of the concrete type.
+ *
+ * session_begin:   look up the kernel for (op, dtype), reset the cmd_queue
+ *                  state, and launch the persistent kernel on the existing
+ *                  stream.  Wires all session dispatch hooks and returns the
+ *                  session.  Returns NULL if no kernel exists.
+ *
+ * session_reduce:  write src/dst/count to the command slot, set status=1
+ *                  to wake the kernel, and spin until status==2.
+ *
+ * session_stop:    signal the persistent kernel to exit and synchronize the
+ *                  stream.  The cmd_queue's GPU stream and managed memory
+ *                  remain allocated for reuse.
+ */
+
+#include "ompi_config.h"
+#include <stdbool.h>
+#include <stdlib.h>
+#include <sched.h>
+
+#include <cuda_runtime.h>
+
+#include "opal/mca/accelerator/base/base.h"
+#include "ompi/op/op.h"
+#include "ompi/datatype/ompi_datatype.h"
+#include "ompi/op/op_gpu_session.h"
+#include "ompi/mca/op/op.h"
+#include "ompi/mca/op/cuda/op_cuda.h"
+
+/* ompi_op_ddt_map[] maps dtype->id → OMPI_OP_BASE_TYPE_* (-1 if none) */
+extern int ompi_op_ddt_map[OMPI_DATATYPE_MAX_PREDEFINED];
+
+/* Forward declarations of static session hooks referenced from session_begin. */
+static void ompi_op_cuda_session_reduce(ompi_op_gpu_session_t *session,
+                                         const void *src1, const void *src2,
+                                         void *dst, size_t count);
+static void ompi_op_cuda_session_stop(ompi_op_gpu_session_t *session);
+
+/* --------------------------------------------------------------------------
+ * OBJ constructor / destructor for ompi_op_cuda_cmd_queue_t
+ * -------------------------------------------------------------------------- */
+static void
+ompi_op_cuda_cmd_queue_construct(ompi_op_cuda_cmd_queue_t *q)
+{
+    q->shutdown       = NULL;
+    q->stream         = NULL;
+    q->super.cmd      = NULL;
+    q->super.dev_id   = -1;
+    q->super.allocator = NULL;
+    q->super.session_begin_fn = NULL;
+}
+
+static void
+ompi_op_cuda_cmd_queue_destruct(ompi_op_cuda_cmd_queue_t *q)
+{
+    if (NULL != q->stream) {
+        cudaStreamDestroy(q->stream);
+        q->stream = NULL;
+    }
+    if (NULL != q->shutdown) {
+        cudaFree((void *) q->shutdown);
+        q->shutdown = NULL;
+    }
+    if (NULL != q->super.cmd) {
+        cudaFree(q->super.cmd);
+        q->super.cmd = NULL;
+    }
+}
+
+OBJ_CLASS_INSTANCE(ompi_op_cuda_cmd_queue_t,
+                   ompi_op_gpu_cmd_queue_t,
+                   ompi_op_cuda_cmd_queue_construct,
+                   ompi_op_cuda_cmd_queue_destruct);
+
+/* --------------------------------------------------------------------------
+ * ompi_op_cuda_cmd_queue_alloc
+ *
+ * Allocate the expensive GPU resources for one device: a managed-memory
+ * command slot, a managed-memory shutdown flag, and a private CUDA stream.
+ * Returns the base pointer (ompi_op_gpu_cmd_queue_t *); NULL on failure.
+ * -------------------------------------------------------------------------- */
+ompi_op_gpu_cmd_queue_t *
+ompi_op_cuda_cmd_queue_alloc(int dev_id)
+{
+    ompi_op_cuda_cmd_queue_t *q = OBJ_NEW(ompi_op_cuda_cmd_queue_t);
+    if (NULL == q) {
+        return NULL;
+    }
+
+    cudaError_t err;
+
+    err = cudaMallocManaged((void **) &q->super.cmd,
+                            sizeof(ompi_op_gpu_cmd_t),
+                            cudaMemAttachGlobal);
+    if (cudaSuccess != err) {
+        OBJ_RELEASE(q);
+        return NULL;
+    }
+    q->super.cmd->src1   = NULL;
+    q->super.cmd->src2   = NULL;
+    q->super.cmd->dst    = NULL;
+    q->super.cmd->count  = 0;
+    q->super.cmd->status = 0;
+
+    err = cudaMallocManaged((void **) &q->shutdown,
+                            sizeof(int32_t),
+                            cudaMemAttachGlobal);
+    if (cudaSuccess != err) {
+        OBJ_RELEASE(q);
+        return NULL;
+    }
+    *q->shutdown = 0;
+
+    err = cudaStreamCreateWithFlags(&q->stream, cudaStreamNonBlocking);
+    if (cudaSuccess != err) {
+        OBJ_RELEASE(q);
+        return NULL;
+    }
+
+    q->super.dev_id    = dev_id;
+    q->super.allocator = opal_accelerator_base_get_device_allocator(dev_id);
+    return &q->super;
+}
+
+/* --------------------------------------------------------------------------
+ * ompi_op_cuda_session_begin
+ *
+ * Look up the GPU kernel for (op, dtype), reset the cmd_queue state, and
+ * launch the persistent kernel on the existing stream.  Wires all session
+ * dispatch hooks before returning.  Returns NULL if no GPU kernel exists
+ * for this combination or if the kernel launch fails.
+ * -------------------------------------------------------------------------- */
+ompi_op_gpu_session_t *
+ompi_op_cuda_session_begin(ompi_op_gpu_cmd_queue_t *queue,
+                            struct ompi_op_t *op,
+                            struct ompi_datatype_t *dtype)
+{
+    int op_idx   = op->o_f_to_c_index;
+    int type_idx = (dtype->id < OMPI_DATATYPE_MAX_PREDEFINED)
+                   ? ompi_op_ddt_map[dtype->id] : -1;
+
+    if (op_idx  < 0 || op_idx  >= OMPI_OP_BASE_FORTRAN_OP_MAX ||
+        type_idx < 0 || type_idx >= OMPI_OP_BASE_TYPE_MAX) {
+        return NULL;
+    }
+
+    ompi_op_cuda_launcher_fn_t launcher = ompi_op_cuda_kernel_fns[op_idx][type_idx];
+    if (NULL == launcher) {
+        return NULL;
+    }
+
+    ompi_op_cuda_cmd_queue_t *cq = (ompi_op_cuda_cmd_queue_t *) queue;
+
+    /* Reset queue state for the new kernel */
+    *cq->shutdown        = 0;
+    queue->cmd->src1     = NULL;
+    queue->cmd->src2     = NULL;
+    queue->cmd->dst      = NULL;
+    queue->cmd->count    = 0;
+    queue->cmd->status   = 0;
+
+    /* Launch the persistent kernel */
+    launcher(queue->cmd, cq->shutdown, cq->stream);
+    cudaError_t err = cudaGetLastError();
+    if (cudaSuccess != err) {
+        return NULL;
+    }
+
+    ompi_op_gpu_session_t *session =
+        (ompi_op_gpu_session_t *) malloc(sizeof(ompi_op_gpu_session_t));
+    if (NULL == session) {
+        return NULL;
+    }
+
+    session->queue     = queue;
+    session->allocator = queue->allocator;
+    session->reduce_fn = ompi_op_cuda_session_reduce;
+    session->stop_fn   = ompi_op_cuda_session_stop;
+    return session;
+}
+
+/* --------------------------------------------------------------------------
+ * ompi_op_cuda_session_reduce
+ * -------------------------------------------------------------------------- */
+static void
+ompi_op_cuda_session_reduce(ompi_op_gpu_session_t *session,
+                             const void *src1, const void *src2,
+                             void *dst, size_t count)
+{
+    ompi_op_gpu_cmd_t *cmd = session->queue->cmd;
+
+    /* Write operands before signalling the kernel */
+    cmd->src1  = src1;
+    cmd->src2  = src2;
+    cmd->dst   = dst;
+    cmd->count = (int64_t) count;
+
+    __atomic_thread_fence(__ATOMIC_SEQ_CST);   /* ensure writes visible to GPU */
+    cmd->status = 1;                           /* wake the kernel */
+
+    /* Spin-wait for the kernel to signal completion */
+    while (2 != cmd->status) {
+        sched_yield();   /* relinquish CPU timeslice while waiting */
+    }
+
+    /* Reset for the next call */
+    cmd->status = 0;
+}
+
+/* --------------------------------------------------------------------------
+ * ompi_op_cuda_session_stop
+ *
+ * Signal the persistent kernel to exit and wait for the stream to drain.
+ * The cmd_queue's stream and managed memory remain allocated for reuse.
+ * -------------------------------------------------------------------------- */
+static void
+ompi_op_cuda_session_stop(ompi_op_gpu_session_t *session)
+{
+    ompi_op_cuda_cmd_queue_t *cq = (ompi_op_cuda_cmd_queue_t *) session->queue;
+
+    /* Signal the kernel to exit its loop */
+    *cq->shutdown = 1;
+    __atomic_thread_fence(__ATOMIC_SEQ_CST);
+
+    /* Wait for the kernel to finish; stream remains valid after this */
+    cudaStreamSynchronize(cq->stream);
+}
diff --git a/ompi/mca/op/op.h b/ompi/mca/op/op.h
index 34d26376ab9..ffb21da034a 100644
--- a/ompi/mca/op/op.h
+++ b/ompi/mca/op/op.h
@@ -326,6 +326,30 @@ typedef struct ompi_op_base_module_1_0_0_t *
   (*ompi_op_base_component_op_query_1_0_0_fn_t)
     (struct ompi_op_t *op, int *priority);
 
+/* Forward declarations for GPU types (defined in ompi/op/op_gpu_session.h) */
+struct ompi_op_gpu_cmd_queue_t;
+struct ompi_op_gpu_session_t;
+
+/**
+ * Optional component hook: allocate the expensive GPU resources for a
+ * cmd_queue on the given device: managed-memory command slot, shutdown flag,
+ * and a private GPU stream.  Returns NULL on allocation failure.
+ * The caller (op_gpu_session.c) wires session_begin_fn and free_fn.
+ */
+typedef struct ompi_op_gpu_cmd_queue_t *
+  (*ompi_op_base_component_cmd_queue_alloc_fn_t)(int dev_id);
+
+/**
+ * Optional component hook: look up the GPU kernel for (op, dtype), reset the
+ * cmd_queue state, and launch the persistent kernel on the queue's stream.
+ * Returns a fully-wired ompi_op_gpu_session_t on success, NULL if no GPU
+ * kernel exists for this (op, dtype) combination.
+ */
+typedef struct ompi_op_gpu_session_t *
+  (*ompi_op_base_component_session_begin_fn_t)(struct ompi_op_gpu_cmd_queue_t *queue,
+                                               struct ompi_op_t *op,
+                                               struct ompi_datatype_t *dtype);
+
 /**
  * Op component interface.
  *
@@ -343,6 +367,10 @@ typedef struct ompi_op_base_component_1_0_0_t {
     ompi_op_base_component_init_query_fn_t opc_init_query;
     /** Query whether component is usable for given op */
     ompi_op_base_component_op_query_1_0_0_fn_t opc_op_query;
+
+    /** Optional: GPU cmd_queue and session hooks.  NULL in host-only components. */
+    ompi_op_base_component_cmd_queue_alloc_fn_t  opc_cmd_queue_alloc;
+    ompi_op_base_component_session_begin_fn_t    opc_session_begin;
 } ompi_op_base_component_1_0_0_t;
 
 
diff --git a/ompi/mca/op/rocm/Makefile.am b/ompi/mca/op/rocm/Makefile.am
new file mode 100644
index 00000000000..a3bdfe08c8a
--- /dev/null
+++ b/ompi/mca/op/rocm/Makefile.am
@@ -0,0 +1,73 @@
+#
+# Copyright (c) 2025      Amazon.com, Inc. or its affiliates.  All rights
+#                         reserved.
+# $COPYRIGHT$
+#
+# Additional copyrights may follow
+#
+# $HEADER$
+#
+
+# C sources (compiled by the normal C compiler)
+sources = \
+    op_rocm.h \
+    op_rocm_component.c \
+    op_rocm_session.c
+
+# The .cpp file is compiled separately by hipcc and linked in as a plain
+# object.  Libtool does not know how to compile HIP C++, so we use a custom
+# rule.  The resulting object is appended to LIBADD for both DSO and static
+# builds.
+
+rocm_sources  = op_rocm_kernels.cpp
+CLEANFILES    = op_rocm_kernels.o
+
+# Include paths forwarded to hipcc so it can find ompi_config.h and the
+# op/mca headers.
+HIPCC_INCLUDES = \
+    -I$(top_srcdir) \
+    -I$(top_builddir) \
+    -I$(top_srcdir)/ompi \
+    -I$(top_builddir)/ompi \
+    -I$(top_builddir)/opal \
+    -I$(top_builddir)/opal/include \
+    -I$(top_srcdir)/ompi/include \
+    -I$(top_srcdir)/opal/include
+
+op_rocm_kernels.l$(OBJEXT): $(srcdir)/op_rocm_kernels.cpp \
+                   $(srcdir)/op_rocm.h
+	$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile \
+		$(HIPCC) $(HIPCCFLAGS) $(HIPCC_INCLUDES) \
+	    $(op_rocm_CPPFLAGS) \
+	    -c $<
+
+AM_CPPFLAGS = $(op_rocm_CPPFLAGS)
+
+# ----------------------------------------------------------------------------
+# DSO build
+# ----------------------------------------------------------------------------
+if MCA_BUILD_ompi_op_rocm_DSO
+component_install = mca_op_rocm.la
+component_noinst  =
+else
+component_install =
+component_noinst  = libmca_op_rocm.la
+endif
+
+mcacomponentdir = $(ompilibdir)
+mcacomponent_LTLIBRARIES = $(component_install)
+
+mca_op_rocm_la_SOURCES  = $(sources)
+mca_op_rocm_la_LDFLAGS  = -module -avoid-version $(op_rocm_LDFLAGS)
+mca_op_rocm_la_LIBADD   = $(rocm_sources:.cpp=.lo)
+mca_op_rocm_la_CPPFLAGS = $(op_rocm_CPPFLAGS)
+
+# ----------------------------------------------------------------------------
+# Static (convenience library) build
+# ----------------------------------------------------------------------------
+noinst_LTLIBRARIES = $(component_noinst)
+
+libmca_op_rocm_la_SOURCES  = $(sources)
+libmca_op_rocm_la_LDFLAGS  = -module -avoid-version $(op_rocm_LDFLAGS)
+libmca_op_rocm_la_LIBADD   = $(op_rocm_LIBS) $(rocm_sources:.cpp=.lo)
+libmca_op_rocm_la_CPPFLAGS = $(op_rocm_CPPFLAGS)
diff --git a/ompi/mca/op/rocm/configure.m4 b/ompi/mca/op/rocm/configure.m4
new file mode 100644
index 00000000000..6625aa06f7a
--- /dev/null
+++ b/ompi/mca/op/rocm/configure.m4
@@ -0,0 +1,70 @@
+# -*- shell-script -*-
+#
+# Copyright (c) 2025      Amazon.com, Inc. or its affiliates.  All rights
+#                         reserved.
+# $COPYRIGHT$
+#
+# Additional copyrights may follow
+#
+# $HEADER$
+#
+
+# MCA_ompi_op_rocm_CONFIG([action-if-can-compile],
+#                          [action-if-cant-compile])
+# ------------------------------------------------
+# Build the ROCm persistent-kernel op component only when the HIP runtime
+# (libamdhip64 + hip/hip_runtime.h) and hipcc are available.
+#
+# Calls OPAL_CHECK_ROCM to locate headers and libraries, then separately
+# finds hipcc.  Sets:
+#   op_rocm_CPPFLAGS — include/define flags for HIP (includes -D__HIP_PLATFORM_AMD__)
+#   op_rocm_LDFLAGS  — library search path for libamdhip64
+#   op_rocm_LIBS     — -lamdhip64
+#   HIPCC            — path to the hipcc compiler
+#   HIPCCFLAGS       — default hipcc flags
+#
+AC_DEFUN([MCA_ompi_op_rocm_CONFIG],[
+    AC_CONFIG_FILES([ompi/mca/op/rocm/Makefile])
+
+    OPAL_VAR_SCOPE_PUSH([op_rocm_happy op_rocm_hipcc_path])
+
+    op_rocm_happy=no
+
+    # OPAL_CHECK_ROCM calls OAC_CHECK_PACKAGE and sets:
+    #   op_rocm_CPPFLAGS, op_rocm_LDFLAGS, op_rocm_LIBS
+    # It also sets ROCM_SUPPORT=1 on success.
+    OPAL_CHECK_ROCM([op_rocm],
+        [op_rocm_happy=yes],
+        [op_rocm_happy=no])
+
+    # Find hipcc alongside the ROCm installation.
+    AS_IF([test "$op_rocm_happy" = "yes"],
+      [op_rocm_hipcc_path="$PATH"
+       AS_IF([test -n "$with_rocm" && test "$with_rocm" != "no" && test -d "$with_rocm/bin"],
+             [op_rocm_hipcc_path="$with_rocm/bin:$PATH"],
+             [AS_IF([test -d "/opt/rocm/bin"],
+                    [op_rocm_hipcc_path="/opt/rocm/bin:$PATH"])])
+       AC_PATH_PROG([HIPCC], [hipcc], [not_found], [$op_rocm_hipcc_path])
+       AS_IF([test "$HIPCC" = "not_found"],
+             [AC_MSG_WARN([hipcc not found; skipping op/rocm component])
+              op_rocm_happy=no])
+      ])
+
+    OPAL_SUMMARY_ADD([Accelerators], [ROCm operator support], [], [$op_rocm_happy])
+    # Default HIPCCFLAGS if not already set by the user.
+    AS_IF([test "$op_rocm_happy" = "yes" && test "x$HIPCCFLAGS" = "x"],
+          [HIPCCFLAGS="--offload-arch=gfx906"])
+
+    AC_SUBST([op_rocm_CPPFLAGS])
+    AC_SUBST([op_rocm_LDFLAGS])
+    AC_SUBST([op_rocm_LIBS])
+    AC_SUBST([HIPCC])
+    AC_SUBST([HIPCCFLAGS])
+
+    AS_IF([test "$op_rocm_happy" = "yes"],
+          [$1],
+          [$2])
+
+    OPAL_VAR_SCOPE_POP
+
+])dnl
diff --git a/ompi/mca/op/rocm/op_rocm.h b/ompi/mca/op/rocm/op_rocm.h
new file mode 100644
index 00000000000..df80d4f915e
--- /dev/null
+++ b/ompi/mca/op/rocm/op_rocm.h
@@ -0,0 +1,66 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
+/*
+ * Copyright (c) 2025      Amazon.com, Inc. or its affiliates.  All rights
+ *                         reserved.
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+
+#ifndef OMPI_MCA_OP_ROCM_H
+#define OMPI_MCA_OP_ROCM_H
+
+#include "ompi_config.h"
+#include <hip/hip_runtime.h>
+
+#include "ompi/mca/op/op.h"
+#include "ompi/op/op_gpu_session.h"  /* defines ompi_op_gpu_cmd_t */
+
+BEGIN_C_DECLS
+
+/**
+ * ROCm-specific cmd_queue.  Inherits ompi_op_gpu_cmd_queue_t by placing it
+ * as the first member named "super".  The HIP stream and shutdown flag are
+ * stored directly here rather than in a separate priv allocation.
+ * Allocated with OBJ_NEW; the OBJ destructor chain releases GPU resources.
+ */
+typedef struct ompi_op_rocm_cmd_queue_t {
+    ompi_op_gpu_cmd_queue_t  super;       /* MUST be first */
+    volatile int32_t        *shutdown;    /* managed-memory shutdown flag */
+    hipStream_t              stream;      /* private HIP stream */
+} ompi_op_rocm_cmd_queue_t;
+OBJ_CLASS_DECLARATION(ompi_op_rocm_cmd_queue_t);
+
+/**
+ * Host-side launcher function type.
+ * Launches the persistent kernel for one (op, type) combination.
+ */
+typedef void (*ompi_op_rocm_launcher_fn_t)(ompi_op_gpu_cmd_t *cmd,
+                                           volatile int32_t  *shutdown,
+                                           hipStream_t        stream);
+
+/**
+ * 2D table [op_index][type_index] of launcher functions.
+ * NULL entries indicate unsupported (op, type) combinations; the session
+ * machinery returns NULL for those and the caller falls back to the host path.
+ *
+ * Indexed by OMPI_OP_BASE_FORTRAN_* × OMPI_OP_BASE_TYPE_*.
+ * Defined (and initialized) in op_rocm_kernels.cpp.
+ */
+OMPI_DECLSPEC extern ompi_op_rocm_launcher_fn_t
+ompi_op_rocm_kernel_fns[OMPI_OP_BASE_FORTRAN_OP_MAX][OMPI_OP_BASE_TYPE_MAX];
+
+/* Defined in op_rocm_kernels.cpp (extern "C") */
+void ompi_op_rocm_kernel_fns_init(void);
+
+/* Defined in op_rocm_session.c */
+ompi_op_gpu_cmd_queue_t *ompi_op_rocm_cmd_queue_alloc(int dev_id);
+ompi_op_gpu_session_t *ompi_op_rocm_session_begin(ompi_op_gpu_cmd_queue_t *queue,
+                                                   struct ompi_op_t *op,
+                                                   struct ompi_datatype_t *dtype);
+
+END_C_DECLS
+
+#endif /* OMPI_MCA_OP_ROCM_H */
diff --git a/ompi/mca/op/rocm/op_rocm_component.c b/ompi/mca/op/rocm/op_rocm_component.c
new file mode 100644
index 00000000000..7966a79f1e2
--- /dev/null
+++ b/ompi/mca/op/rocm/op_rocm_component.c
@@ -0,0 +1,96 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
+/*
+ * Copyright (c) 2025      Amazon.com, Inc. or its affiliates.  All rights
+ *                         reserved.
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+
+#include "ompi_config.h"
+
+#include <hip/hip_runtime.h>
+
+#include "ompi/constants.h"
+#include "ompi/op/op.h"
+#include "ompi/mca/op/op.h"
+#include "ompi/mca/op/base/base.h"
+#include "ompi/op/op_gpu_session.h"
+#include "ompi/mca/op/rocm/op_rocm.h"
+
+
+static int rocm_component_open(void);
+static int rocm_component_close(void);
+static int rocm_component_init_query(bool enable_progress_threads,
+                                      bool enable_mpi_thread_multiple);
+static struct ompi_op_base_module_1_0_0_t *
+    rocm_component_op_query(struct ompi_op_t *op, int *priority);
+
+/*
+ * Public component descriptor.
+ */
+ompi_op_base_component_1_0_0_t mca_op_rocm_component = {
+    .opc_version = {
+        OMPI_OP_BASE_VERSION_1_0_0,
+
+        .mca_component_name = "rocm",
+        MCA_BASE_MAKE_VERSION(component, OMPI_MAJOR_VERSION, OMPI_MINOR_VERSION,
+                              OMPI_RELEASE_VERSION),
+        .mca_open_component  = rocm_component_open,
+        .mca_close_component = rocm_component_close,
+    },
+    .opc_data = {
+        MCA_BASE_METADATA_PARAM_CHECKPOINT
+    },
+
+    .opc_init_query = rocm_component_init_query,
+    .opc_op_query   = rocm_component_op_query,
+
+    /* GPU session hooks */
+    .opc_cmd_queue_alloc = ompi_op_rocm_cmd_queue_alloc,
+    .opc_session_begin   = ompi_op_rocm_session_begin,
+};
+MCA_BASE_COMPONENT_INIT(ompi, op, rocm)
+
+static int
+rocm_component_open(void)
+{
+    ompi_op_rocm_kernel_fns_init();
+    return OMPI_SUCCESS;
+}
+
+static int
+rocm_component_close(void)
+{
+    return OMPI_SUCCESS;
+}
+
+/*
+ * Only activate this component when at least one ROCm-capable device is
+ * present in the system.
+ */
+static int
+rocm_component_init_query(bool enable_progress_threads,
+                           bool enable_mpi_thread_multiple)
+{
+    int device_count = 0;
+    hipError_t err   = hipGetDeviceCount(&device_count);
+    if (hipSuccess != err || device_count <= 0) {
+        return OMPI_ERR_NOT_SUPPORTED;
+    }
+    return OMPI_SUCCESS;
+}
+
+/*
+ * We do not provide per-op function pointers, only session hooks, so
+ * always return NULL here.
+ */
+static struct ompi_op_base_module_1_0_0_t *
+rocm_component_op_query(struct ompi_op_t *op, int *priority)
+{
+    (void) op;
+    (void) priority;
+    return NULL;
+}
diff --git a/ompi/mca/op/rocm/op_rocm_kernels.cpp b/ompi/mca/op/rocm/op_rocm_kernels.cpp
new file mode 100644
index 00000000000..d492957c002
--- /dev/null
+++ b/ompi/mca/op/rocm/op_rocm_kernels.cpp
@@ -0,0 +1,285 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
+/*
+ * Copyright (c) 2025      Amazon.com, Inc. or its affiliates.  All rights
+ *                         reserved.
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+
+/*
+ * Persistent reduction kernels for the ROCm op component.
+ *
+ * Structure mirrors op_cuda_kernels.cu; differences from the CUDA version:
+ *   - #include <hip/hip_runtime.h> instead of <cuda_runtime.h>
+ *   - __builtin_amdgcn_s_sleep(1) replaces __nanosleep(1000) for the
+ *     low-power spin-wait (GCN instruction; ~64 clock sleep)
+ *   - All cuda* API calls replaced by their hip* equivalents
+ *   - Launcher macro uses ompi_op_rocm_persistent_* prefix
+ */
+
+#include <stdint.h>
+#include <hip/hip_runtime.h>
+
+#include "ompi/mca/op/op.h"
+#include "ompi/mca/op/rocm/op_rocm.h"
+
+/* -------------------------------------------------------------------------
+ * PERSISTENT_KERNEL(name, ctype, op_expr)
+ *
+ * Generates __global__ void ompi_op_rocm_persistent_<name>(...).
+ * op_expr must be a statement writing dst[i] from src1[i] and src2[i],
+ * e.g. "dst[i] = src1[i] + src2[i]".  src2 may alias dst for in-place ops.
+ * ------------------------------------------------------------------------- */
+#define PERSISTENT_KERNEL(kname, ctype, op_expr)                               \
+__global__ void ompi_op_rocm_persistent_##kname(                               \
+        ompi_op_gpu_cmd_t *cmd, volatile int32_t *shutdown)                    \
+{                                                                               \
+    while (!*shutdown) {                                                        \
+        /* Spin-wait for work; sleep ~64 clocks between polls to save power */ \
+        while (cmd->status != 1 && !*shutdown) {                               \
+            __builtin_amdgcn_s_sleep(1);                                       \
+        }                                                                       \
+        if (*shutdown) break;                                                   \
+        const ctype * __restrict__ src1 = (const ctype *) cmd->src1;           \
+        const ctype * __restrict__ src2 = (const ctype *) cmd->src2;           \
+              ctype * __restrict__ dst  = (      ctype *) cmd->dst;            \
+        int64_t n = cmd->count;                                                 \
+        for (int64_t i = (int64_t)threadIdx.x; i < n; i += blockDim.x) {      \
+            op_expr;                                                            \
+        }                                                                       \
+        __syncthreads();                                                        \
+        if (threadIdx.x == 0) {                                                 \
+            __threadfence_system();   /* ensure dst writes reach host */        \
+            cmd->status = 2;          /* signal done */                         \
+        }                                                                       \
+    }                                                                           \
+}
+
+/* =========================================================================
+ * Kernel instantiations
+ * ========================================================================= */
+
+/* --- MAX --- */
+PERSISTENT_KERNEL(max_int8,   int8_t,   dst[i] = src1[i] > src2[i] ? src1[i] : src2[i])
+PERSISTENT_KERNEL(max_uint8,  uint8_t,  dst[i] = src1[i] > src2[i] ? src1[i] : src2[i])
+PERSISTENT_KERNEL(max_int16,  int16_t,  dst[i] = src1[i] > src2[i] ? src1[i] : src2[i])
+PERSISTENT_KERNEL(max_uint16, uint16_t, dst[i] = src1[i] > src2[i] ? src1[i] : src2[i])
+PERSISTENT_KERNEL(max_int32,  int32_t,  dst[i] = src1[i] > src2[i] ? src1[i] : src2[i])
+PERSISTENT_KERNEL(max_uint32, uint32_t, dst[i] = src1[i] > src2[i] ? src1[i] : src2[i])
+PERSISTENT_KERNEL(max_int64,  int64_t,  dst[i] = src1[i] > src2[i] ? src1[i] : src2[i])
+PERSISTENT_KERNEL(max_uint64, uint64_t, dst[i] = src1[i] > src2[i] ? src1[i] : src2[i])
+PERSISTENT_KERNEL(max_float,  float,    dst[i] = src1[i] > src2[i] ? src1[i] : src2[i])
+PERSISTENT_KERNEL(max_double, double,   dst[i] = src1[i] > src2[i] ? src1[i] : src2[i])
+
+/* --- MIN --- */
+PERSISTENT_KERNEL(min_int8,   int8_t,   dst[i] = src1[i] < src2[i] ? src1[i] : src2[i])
+PERSISTENT_KERNEL(min_uint8,  uint8_t,  dst[i] = src1[i] < src2[i] ? src1[i] : src2[i])
+PERSISTENT_KERNEL(min_int16,  int16_t,  dst[i] = src1[i] < src2[i] ? src1[i] : src2[i])
+PERSISTENT_KERNEL(min_uint16, uint16_t, dst[i] = src1[i] < src2[i] ? src1[i] : src2[i])
+PERSISTENT_KERNEL(min_int32,  int32_t,  dst[i] = src1[i] < src2[i] ? src1[i] : src2[i])
+PERSISTENT_KERNEL(min_uint32, uint32_t, dst[i] = src1[i] < src2[i] ? src1[i] : src2[i])
+PERSISTENT_KERNEL(min_int64,  int64_t,  dst[i] = src1[i] < src2[i] ? src1[i] : src2[i])
+PERSISTENT_KERNEL(min_uint64, uint64_t, dst[i] = src1[i] < src2[i] ? src1[i] : src2[i])
+PERSISTENT_KERNEL(min_float,  float,    dst[i] = src1[i] < src2[i] ? src1[i] : src2[i])
+PERSISTENT_KERNEL(min_double, double,   dst[i] = src1[i] < src2[i] ? src1[i] : src2[i])
+
+/* --- SUM --- */
+PERSISTENT_KERNEL(sum_int8,   int8_t,   dst[i] = src1[i] + src2[i])
+PERSISTENT_KERNEL(sum_uint8,  uint8_t,  dst[i] = src1[i] + src2[i])
+PERSISTENT_KERNEL(sum_int16,  int16_t,  dst[i] = src1[i] + src2[i])
+PERSISTENT_KERNEL(sum_uint16, uint16_t, dst[i] = src1[i] + src2[i])
+PERSISTENT_KERNEL(sum_int32,  int32_t,  dst[i] = src1[i] + src2[i])
+PERSISTENT_KERNEL(sum_uint32, uint32_t, dst[i] = src1[i] + src2[i])
+PERSISTENT_KERNEL(sum_int64,  int64_t,  dst[i] = src1[i] + src2[i])
+PERSISTENT_KERNEL(sum_uint64, uint64_t, dst[i] = src1[i] + src2[i])
+PERSISTENT_KERNEL(sum_float,  float,    dst[i] = src1[i] + src2[i])
+PERSISTENT_KERNEL(sum_double, double,   dst[i] = src1[i] + src2[i])
+
+/* --- PROD --- */
+PERSISTENT_KERNEL(prod_int8,   int8_t,   dst[i] = src1[i] * src2[i])
+PERSISTENT_KERNEL(prod_uint8,  uint8_t,  dst[i] = src1[i] * src2[i])
+PERSISTENT_KERNEL(prod_int16,  int16_t,  dst[i] = src1[i] * src2[i])
+PERSISTENT_KERNEL(prod_uint16, uint16_t, dst[i] = src1[i] * src2[i])
+PERSISTENT_KERNEL(prod_int32,  int32_t,  dst[i] = src1[i] * src2[i])
+PERSISTENT_KERNEL(prod_uint32, uint32_t, dst[i] = src1[i] * src2[i])
+PERSISTENT_KERNEL(prod_int64,  int64_t,  dst[i] = src1[i] * src2[i])
+PERSISTENT_KERNEL(prod_uint64, uint64_t, dst[i] = src1[i] * src2[i])
+PERSISTENT_KERNEL(prod_float,  float,    dst[i] = src1[i] * src2[i])
+PERSISTENT_KERNEL(prod_double, double,   dst[i] = src1[i] * src2[i])
+
+/* --- BAND (bitwise AND, integer types only) --- */
+PERSISTENT_KERNEL(band_int8,   int8_t,   dst[i] = src1[i] & src2[i])
+PERSISTENT_KERNEL(band_uint8,  uint8_t,  dst[i] = src1[i] & src2[i])
+PERSISTENT_KERNEL(band_int16,  int16_t,  dst[i] = src1[i] & src2[i])
+PERSISTENT_KERNEL(band_uint16, uint16_t, dst[i] = src1[i] & src2[i])
+PERSISTENT_KERNEL(band_int32,  int32_t,  dst[i] = src1[i] & src2[i])
+PERSISTENT_KERNEL(band_uint32, uint32_t, dst[i] = src1[i] & src2[i])
+PERSISTENT_KERNEL(band_int64,  int64_t,  dst[i] = src1[i] & src2[i])
+PERSISTENT_KERNEL(band_uint64, uint64_t, dst[i] = src1[i] & src2[i])
+
+/* --- BOR (bitwise OR) --- */
+PERSISTENT_KERNEL(bor_int8,   int8_t,   dst[i] = src1[i] | src2[i])
+PERSISTENT_KERNEL(bor_uint8,  uint8_t,  dst[i] = src1[i] | src2[i])
+PERSISTENT_KERNEL(bor_int16,  int16_t,  dst[i] = src1[i] | src2[i])
+PERSISTENT_KERNEL(bor_uint16, uint16_t, dst[i] = src1[i] | src2[i])
+PERSISTENT_KERNEL(bor_int32,  int32_t,  dst[i] = src1[i] | src2[i])
+PERSISTENT_KERNEL(bor_uint32, uint32_t, dst[i] = src1[i] | src2[i])
+PERSISTENT_KERNEL(bor_int64,  int64_t,  dst[i] = src1[i] | src2[i])
+PERSISTENT_KERNEL(bor_uint64, uint64_t, dst[i] = src1[i] | src2[i])
+
+/* --- BXOR (bitwise XOR) --- */
+PERSISTENT_KERNEL(bxor_int8,   int8_t,   dst[i] = src1[i] ^ src2[i])
+PERSISTENT_KERNEL(bxor_uint8,  uint8_t,  dst[i] = src1[i] ^ src2[i])
+PERSISTENT_KERNEL(bxor_int16,  int16_t,  dst[i] = src1[i] ^ src2[i])
+PERSISTENT_KERNEL(bxor_uint16, uint16_t, dst[i] = src1[i] ^ src2[i])
+PERSISTENT_KERNEL(bxor_int32,  int32_t,  dst[i] = src1[i] ^ src2[i])
+PERSISTENT_KERNEL(bxor_uint32, uint32_t, dst[i] = src1[i] ^ src2[i])
+PERSISTENT_KERNEL(bxor_int64,  int64_t,  dst[i] = src1[i] ^ src2[i])
+PERSISTENT_KERNEL(bxor_uint64, uint64_t, dst[i] = src1[i] ^ src2[i])
+
+/* =========================================================================
+ * Host-side launcher wrappers — one per kernel, 1 block × 256 threads.
+ * ========================================================================= */
+#define LAUNCHER(kname)                                                        \
+static void launch_##kname(ompi_op_gpu_cmd_t *cmd,                            \
+                            volatile int32_t  *sd,                             \
+                            hipStream_t        stream)                         \
+{                                                                               \
+    ompi_op_rocm_persistent_##kname<<<1, 256, 0, stream>>>(cmd, sd);          \
+}
+
+LAUNCHER(max_int8)    LAUNCHER(max_uint8)
+LAUNCHER(max_int16)   LAUNCHER(max_uint16)
+LAUNCHER(max_int32)   LAUNCHER(max_uint32)
+LAUNCHER(max_int64)   LAUNCHER(max_uint64)
+LAUNCHER(max_float)   LAUNCHER(max_double)
+
+LAUNCHER(min_int8)    LAUNCHER(min_uint8)
+LAUNCHER(min_int16)   LAUNCHER(min_uint16)
+LAUNCHER(min_int32)   LAUNCHER(min_uint32)
+LAUNCHER(min_int64)   LAUNCHER(min_uint64)
+LAUNCHER(min_float)   LAUNCHER(min_double)
+
+LAUNCHER(sum_int8)    LAUNCHER(sum_uint8)
+LAUNCHER(sum_int16)   LAUNCHER(sum_uint16)
+LAUNCHER(sum_int32)   LAUNCHER(sum_uint32)
+LAUNCHER(sum_int64)   LAUNCHER(sum_uint64)
+LAUNCHER(sum_float)   LAUNCHER(sum_double)
+
+LAUNCHER(prod_int8)   LAUNCHER(prod_uint8)
+LAUNCHER(prod_int16)  LAUNCHER(prod_uint16)
+LAUNCHER(prod_int32)  LAUNCHER(prod_uint32)
+LAUNCHER(prod_int64)  LAUNCHER(prod_uint64)
+LAUNCHER(prod_float)  LAUNCHER(prod_double)
+
+LAUNCHER(band_int8)   LAUNCHER(band_uint8)
+LAUNCHER(band_int16)  LAUNCHER(band_uint16)
+LAUNCHER(band_int32)  LAUNCHER(band_uint32)
+LAUNCHER(band_int64)  LAUNCHER(band_uint64)
+
+LAUNCHER(bor_int8)    LAUNCHER(bor_uint8)
+LAUNCHER(bor_int16)   LAUNCHER(bor_uint16)
+LAUNCHER(bor_int32)   LAUNCHER(bor_uint32)
+LAUNCHER(bor_int64)   LAUNCHER(bor_uint64)
+
+LAUNCHER(bxor_int8)   LAUNCHER(bxor_uint8)
+LAUNCHER(bxor_int16)  LAUNCHER(bxor_uint16)
+LAUNCHER(bxor_int32)  LAUNCHER(bxor_uint32)
+LAUNCHER(bxor_int64)  LAUNCHER(bxor_uint64)
+
+/* =========================================================================
+ * 2D launcher table [op_index][type_index]
+ *
+ * Indexed by OMPI_OP_BASE_FORTRAN_* (rows) × OMPI_OP_BASE_TYPE_* (columns).
+ * Zero/NULL entries mean "not supported on GPU" → host fallback.
+ *
+ * Zero-initialized here; filled by ompi_op_rocm_kernel_fns_init() called
+ * from rocm_component_open().  Plain assignment avoids non-trivial designated
+ * initializers which are not supported by all GPU compiler C++ frontends.
+ * ========================================================================= */
+ompi_op_rocm_launcher_fn_t
+ompi_op_rocm_kernel_fns[OMPI_OP_BASE_FORTRAN_OP_MAX][OMPI_OP_BASE_TYPE_MAX];
+
+void
+ompi_op_rocm_kernel_fns_init(void)
+{
+#define SET(op, type, fn) \
+    ompi_op_rocm_kernel_fns[OMPI_OP_BASE_FORTRAN_##op][OMPI_OP_BASE_TYPE_##type] = (fn)
+
+    SET(MAX, INT8_T,   launch_max_int8);
+    SET(MAX, UINT8_T,  launch_max_uint8);
+    SET(MAX, INT16_T,  launch_max_int16);
+    SET(MAX, UINT16_T, launch_max_uint16);
+    SET(MAX, INT32_T,  launch_max_int32);
+    SET(MAX, UINT32_T, launch_max_uint32);
+    SET(MAX, INT64_T,  launch_max_int64);
+    SET(MAX, UINT64_T, launch_max_uint64);
+    SET(MAX, FLOAT,    launch_max_float);
+    SET(MAX, DOUBLE,   launch_max_double);
+
+    SET(MIN, INT8_T,   launch_min_int8);
+    SET(MIN, UINT8_T,  launch_min_uint8);
+    SET(MIN, INT16_T,  launch_min_int16);
+    SET(MIN, UINT16_T, launch_min_uint16);
+    SET(MIN, INT32_T,  launch_min_int32);
+    SET(MIN, UINT32_T, launch_min_uint32);
+    SET(MIN, INT64_T,  launch_min_int64);
+    SET(MIN, UINT64_T, launch_min_uint64);
+    SET(MIN, FLOAT,    launch_min_float);
+    SET(MIN, DOUBLE,   launch_min_double);
+
+    SET(SUM, INT8_T,   launch_sum_int8);
+    SET(SUM, UINT8_T,  launch_sum_uint8);
+    SET(SUM, INT16_T,  launch_sum_int16);
+    SET(SUM, UINT16_T, launch_sum_uint16);
+    SET(SUM, INT32_T,  launch_sum_int32);
+    SET(SUM, UINT32_T, launch_sum_uint32);
+    SET(SUM, INT64_T,  launch_sum_int64);
+    SET(SUM, UINT64_T, launch_sum_uint64);
+    SET(SUM, FLOAT,    launch_sum_float);
+    SET(SUM, DOUBLE,   launch_sum_double);
+
+    SET(PROD, INT8_T,   launch_prod_int8);
+    SET(PROD, UINT8_T,  launch_prod_uint8);
+    SET(PROD, INT16_T,  launch_prod_int16);
+    SET(PROD, UINT16_T, launch_prod_uint16);
+    SET(PROD, INT32_T,  launch_prod_int32);
+    SET(PROD, UINT32_T, launch_prod_uint32);
+    SET(PROD, INT64_T,  launch_prod_int64);
+    SET(PROD, UINT64_T, launch_prod_uint64);
+    SET(PROD, FLOAT,    launch_prod_float);
+    SET(PROD, DOUBLE,   launch_prod_double);
+
+    SET(BAND, INT8_T,   launch_band_int8);
+    SET(BAND, UINT8_T,  launch_band_uint8);
+    SET(BAND, INT16_T,  launch_band_int16);
+    SET(BAND, UINT16_T, launch_band_uint16);
+    SET(BAND, INT32_T,  launch_band_int32);
+    SET(BAND, UINT32_T, launch_band_uint32);
+    SET(BAND, INT64_T,  launch_band_int64);
+    SET(BAND, UINT64_T, launch_band_uint64);
+
+    SET(BOR, INT8_T,   launch_bor_int8);
+    SET(BOR, UINT8_T,  launch_bor_uint8);
+    SET(BOR, INT16_T,  launch_bor_int16);
+    SET(BOR, UINT16_T, launch_bor_uint16);
+    SET(BOR, INT32_T,  launch_bor_int32);
+    SET(BOR, UINT32_T, launch_bor_uint32);
+    SET(BOR, INT64_T,  launch_bor_int64);
+    SET(BOR, UINT64_T, launch_bor_uint64);
+
+    SET(BXOR, INT8_T,   launch_bxor_int8);
+    SET(BXOR, UINT8_T,  launch_bxor_uint8);
+    SET(BXOR, INT16_T,  launch_bxor_int16);
+    SET(BXOR, UINT16_T, launch_bxor_uint16);
+    SET(BXOR, INT32_T,  launch_bxor_int32);
+    SET(BXOR, UINT32_T, launch_bxor_uint32);
+    SET(BXOR, INT64_T,  launch_bxor_int64);
+    SET(BXOR, UINT64_T, launch_bxor_uint64);
+
+    /* LAND, LOR, LXOR, MAXLOC, MINLOC, REPLACE, NO_OP: NULL → host path */
+#undef SET
+}
diff --git a/ompi/mca/op/rocm/op_rocm_session.c b/ompi/mca/op/rocm/op_rocm_session.c
new file mode 100644
index 00000000000..b445d5fc3f5
--- /dev/null
+++ b/ompi/mca/op/rocm/op_rocm_session.c
@@ -0,0 +1,239 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
+/*
+ * Copyright (c) 2025      Amazon.com, Inc. or its affiliates.  All rights
+ *                         reserved.
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+
+/*
+ * Session lifecycle for the ROCm persistent-kernel op component.
+ * Mirrors op_cuda_session.c with hip* API calls in place of cuda*.
+ *
+ * ompi_op_rocm_cmd_queue_t inherits ompi_op_gpu_cmd_queue_t.  OBJ_NEW
+ * allocates the object; the OBJ destructor releases the HIP stream and
+ * managed memory.  The component returns the base pointer from alloc so
+ * callers need no knowledge of the concrete type.
+ *
+ * session_begin:   look up the kernel for (op, dtype), reset the cmd_queue
+ *                  state, and launch the persistent kernel on the existing
+ *                  stream.  Wires all session dispatch hooks and returns the
+ *                  session.  Returns NULL if no kernel exists.
+ *
+ * session_reduce:  write src/dst/count to the command slot, set status=1
+ *                  to wake the kernel, and spin until status==2.
+ *
+ * session_stop:    signal the persistent kernel to exit and synchronize the
+ *                  stream.  The cmd_queue's HIP stream and managed memory
+ *                  remain allocated for reuse.
+ */
+
+#include "ompi_config.h"
+#include <stdbool.h>
+#include <stdlib.h>
+#include <sched.h>
+
+#include <hip/hip_runtime.h>
+
+#include "opal/mca/accelerator/base/base.h"
+#include "ompi/op/op.h"
+#include "ompi/datatype/ompi_datatype.h"
+#include "ompi/op/op_gpu_session.h"
+#include "ompi/mca/op/op.h"
+#include "ompi/mca/op/rocm/op_rocm.h"
+
+/* ompi_op_ddt_map[] maps dtype->id → OMPI_OP_BASE_TYPE_* (-1 if none) */
+extern int ompi_op_ddt_map[OMPI_DATATYPE_MAX_PREDEFINED];
+
+/* Forward declarations of static session hooks referenced from session_begin. */
+static void ompi_op_rocm_session_reduce(ompi_op_gpu_session_t *session,
+                                         const void *src1, const void *src2,
+                                         void *dst, size_t count);
+static void ompi_op_rocm_session_stop(ompi_op_gpu_session_t *session);
+
+/* --------------------------------------------------------------------------
+ * OBJ constructor / destructor for ompi_op_rocm_cmd_queue_t
+ * -------------------------------------------------------------------------- */
+static void
+ompi_op_rocm_cmd_queue_construct(ompi_op_rocm_cmd_queue_t *q)
+{
+    q->shutdown       = NULL;
+    q->stream         = NULL;
+    q->super.cmd      = NULL;
+    q->super.dev_id   = -1;
+    q->super.allocator = NULL;
+    q->super.session_begin_fn = NULL;
+}
+
+static void
+ompi_op_rocm_cmd_queue_destruct(ompi_op_rocm_cmd_queue_t *q)
+{
+    if (NULL != q->stream) {
+        hipStreamDestroy(q->stream);
+        q->stream = NULL;
+    }
+    if (NULL != q->shutdown) {
+        hipFree((void *) q->shutdown);
+        q->shutdown = NULL;
+    }
+    if (NULL != q->super.cmd) {
+        hipFree(q->super.cmd);
+        q->super.cmd = NULL;
+    }
+}
+
+OBJ_CLASS_INSTANCE(ompi_op_rocm_cmd_queue_t,
+                   ompi_op_gpu_cmd_queue_t,
+                   ompi_op_rocm_cmd_queue_construct,
+                   ompi_op_rocm_cmd_queue_destruct);
+
+/* --------------------------------------------------------------------------
+ * ompi_op_rocm_cmd_queue_alloc
+ * -------------------------------------------------------------------------- */
+ompi_op_gpu_cmd_queue_t *
+ompi_op_rocm_cmd_queue_alloc(int dev_id)
+{
+    ompi_op_rocm_cmd_queue_t *q = OBJ_NEW(ompi_op_rocm_cmd_queue_t);
+    if (NULL == q) {
+        return NULL;
+    }
+
+    hipError_t err;
+
+    /* Allocate managed-memory command slot (accessible by both CPU and GPU) */
+    err = hipMallocManaged((void **) &q->super.cmd,
+                           sizeof(ompi_op_gpu_cmd_t),
+                           hipMemAttachGlobal);
+    if (hipSuccess != err) {
+        OBJ_RELEASE(q);
+        return NULL;
+    }
+    q->super.cmd->src1   = NULL;
+    q->super.cmd->src2   = NULL;
+    q->super.cmd->dst    = NULL;
+    q->super.cmd->count  = 0;
+    q->super.cmd->status = 0;
+
+    /* Allocate managed-memory shutdown flag */
+    err = hipMallocManaged((void **) &q->shutdown,
+                           sizeof(int32_t),
+                           hipMemAttachGlobal);
+    if (hipSuccess != err) {
+        OBJ_RELEASE(q);
+        return NULL;
+    }
+    *q->shutdown = 0;
+
+    /* Create a dedicated non-blocking stream for this cmd_queue */
+    err = hipStreamCreateWithFlags(&q->stream, hipStreamNonBlocking);
+    if (hipSuccess != err) {
+        OBJ_RELEASE(q);
+        return NULL;
+    }
+
+    q->super.dev_id    = dev_id;
+    q->super.allocator = opal_accelerator_base_get_device_allocator(dev_id);
+    return &q->super;
+}
+
+/* --------------------------------------------------------------------------
+ * ompi_op_rocm_session_begin
+ * -------------------------------------------------------------------------- */
+ompi_op_gpu_session_t *
+ompi_op_rocm_session_begin(ompi_op_gpu_cmd_queue_t *queue,
+                            struct ompi_op_t *op,
+                            struct ompi_datatype_t *dtype)
+{
+    int op_idx   = op->o_f_to_c_index;
+    int type_idx = (dtype->id < OMPI_DATATYPE_MAX_PREDEFINED)
+                   ? ompi_op_ddt_map[dtype->id] : -1;
+
+    if (op_idx  < 0 || op_idx  >= OMPI_OP_BASE_FORTRAN_OP_MAX ||
+        type_idx < 0 || type_idx >= OMPI_OP_BASE_TYPE_MAX) {
+        return NULL;
+    }
+
+    ompi_op_rocm_launcher_fn_t launcher = ompi_op_rocm_kernel_fns[op_idx][type_idx];
+    if (NULL == launcher) {
+        return NULL;
+    }
+
+    ompi_op_rocm_cmd_queue_t *cq = (ompi_op_rocm_cmd_queue_t *) queue;
+
+    /* Reset queue state for the new kernel */
+    *cq->shutdown      = 0;
+    queue->cmd->src1   = NULL;
+    queue->cmd->src2   = NULL;
+    queue->cmd->dst    = NULL;
+    queue->cmd->count  = 0;
+    queue->cmd->status = 0;
+
+    /* Launch the persistent kernel (1 block, 256 threads) */
+    launcher(queue->cmd, cq->shutdown, cq->stream);
+    hipError_t err = hipGetLastError();
+    if (hipSuccess != err) {
+        return NULL;
+    }
+
+    ompi_op_gpu_session_t *session =
+        (ompi_op_gpu_session_t *) malloc(sizeof(ompi_op_gpu_session_t));
+    if (NULL == session) {
+        return NULL;
+    }
+
+    session->queue     = queue;
+    session->allocator = queue->allocator;
+    session->reduce_fn = ompi_op_rocm_session_reduce;
+    session->stop_fn   = ompi_op_rocm_session_stop;
+    return session;
+}
+
+/* --------------------------------------------------------------------------
+ * ompi_op_rocm_session_reduce
+ * -------------------------------------------------------------------------- */
+static void
+ompi_op_rocm_session_reduce(ompi_op_gpu_session_t *session,
+                             const void *src1, const void *src2,
+                             void *dst, size_t count)
+{
+    ompi_op_gpu_cmd_t *cmd = session->queue->cmd;
+
+    /* Write operands before signalling the kernel */
+    cmd->src1  = src1;
+    cmd->src2  = src2;
+    cmd->dst   = dst;
+    cmd->count = (int64_t) count;
+
+    __atomic_thread_fence(__ATOMIC_SEQ_CST);   /* ensure writes visible to GPU */
+    cmd->status = 1;                           /* wake the kernel */
+
+    /* Spin-wait for the kernel to signal completion */
+    while (2 != cmd->status) {
+        sched_yield();   /* relinquish CPU timeslice while waiting */
+    }
+
+    /* Reset for the next call */
+    cmd->status = 0;
+}
+
+/* --------------------------------------------------------------------------
+ * ompi_op_rocm_session_stop
+ *
+ * Signal the persistent kernel to exit and wait for the stream to drain.
+ * The cmd_queue's stream and managed memory remain allocated for reuse.
+ * -------------------------------------------------------------------------- */
+static void
+ompi_op_rocm_session_stop(ompi_op_gpu_session_t *session)
+{
+    ompi_op_rocm_cmd_queue_t *cq = (ompi_op_rocm_cmd_queue_t *) session->queue;
+
+    /* Signal the kernel to exit its loop */
+    *cq->shutdown = 1;
+    __atomic_thread_fence(__ATOMIC_SEQ_CST);
+
+    /* Wait for the kernel to finish; stream remains valid after this */
+    hipStreamSynchronize(cq->stream);
+}
diff --git a/ompi/op/Makefile.am b/ompi/op/Makefile.am
index 5599c31311b..b86bb1a3965 100644
--- a/ompi/op/Makefile.am
+++ b/ompi/op/Makefile.am
@@ -23,5 +23,7 @@
 # ompi/Makefile.am
 
 headers += op/op.h
+headers += op/op_gpu_session.h
 
 lib@OMPI_LIBMPI_NAME@_la_SOURCES += op/op.c
+lib@OMPI_LIBMPI_NAME@_la_SOURCES += op/op_gpu_session.c
diff --git a/ompi/op/op_gpu_session.c b/ompi/op/op_gpu_session.c
new file mode 100644
index 00000000000..f9217764e1f
--- /dev/null
+++ b/ompi/op/op_gpu_session.c
@@ -0,0 +1,226 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
+/*
+ * Copyright (c) 2025      Amazon.com, Inc. or its affiliates.  All rights
+ *                         reserved.
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+
+/*
+ * Dispatcher and cmd_queue pool for GPU reduction sessions.
+ *
+ * The expensive GPU resources — managed-memory command slot, shutdown flag,
+ * and private GPU stream — are bundled into an ompi_op_gpu_cmd_queue_t and
+ * pooled by dev_id.  Sessions themselves are lightweight structs (function
+ * pointers + a pointer to the cmd_queue) and are allocated fresh for each
+ * collective.
+ *
+ * Pool implementation:
+ *   cmd_queue_pool  — opal_lifo_t providing lock-free thread-safe push/pop.
+ *   cmd_queue_pool_count — atomic counter tracking current pool depth;
+ *                          used to enforce CMD_QUEUE_POOL_MAX without a mutex.
+ *
+ * Pool lifecycle:
+ *   session_end()   — stops the persistent kernel (cmd_queue resources remain
+ *                     allocated), then pushes the cmd_queue into the lifo pool.
+ *   session_begin() — pops from the lifo looking for a matching dev_id entry
+ *                     and calls queue->session_begin_fn(queue, op, dtype) to
+ *                     configure and relaunch the kernel; no cudaMalloc overhead.
+ *                     On pool miss, iterates op components to allocate a fresh
+ *                     cmd_queue, then calls session_begin.
+ *                     On pool hit with no matching dev_id, the queue is pushed
+ *                     back and a fresh allocation is attempted.
+ *                     On pool hit with no kernel for (op, dtype), the queue is
+ *                     returned to the pool and NULL is returned.
+ *
+ * CMD_QUEUE_POOL_MAX caps the total number of idle cmd_queues to bound GPU
+ * resource accumulation.
+ */
+
+#include "ompi_config.h"
+
+#include <stdlib.h>
+
+#include "opal/class/opal_lifo.h"
+#include "opal/class/opal_list.h"
+#include "opal/mca/accelerator/base/base.h"
+#include "opal/mca/base/base.h"
+#include "opal/sys/atomic.h"
+#include "ompi/mca/op/op.h"
+#include "ompi/mca/op/base/base.h"
+#include "ompi/op/op_gpu_session.h"
+#include "ompi/op/op.h"
+
+OBJ_CLASS_INSTANCE(ompi_op_gpu_cmd_queue_t, opal_list_item_t, NULL, NULL);
+
+/* Maximum number of idle cmd_queues kept in the pool. */
+#define CMD_QUEUE_POOL_MAX 16
+
+static opal_lifo_t          cmd_queue_pool;
+static opal_atomic_int32_t  cmd_queue_pool_count = 0;
+
+/* --------------------------------------------------------------------------
+ * cmd_queue_destroy — permanently release a cmd_queue's GPU resources.
+ * OBJ_RELEASE dispatches through the concrete class destructor chain
+ * (e.g. ompi_op_cuda_cmd_queue_t) and frees the allocation.
+ * -------------------------------------------------------------------------- */
+static void
+cmd_queue_destroy(ompi_op_gpu_cmd_queue_t *queue)
+{
+    OBJ_RELEASE(queue);
+}
+
+/* --------------------------------------------------------------------------
+ * cmd_queue_pool_push — return a cmd_queue to the pool.
+ * Destroys the queue instead if the pool is already at capacity.
+ * -------------------------------------------------------------------------- */
+static void
+cmd_queue_pool_push(ompi_op_gpu_cmd_queue_t *queue)
+{
+    if (opal_atomic_add_fetch_32(&cmd_queue_pool_count, 1) <= CMD_QUEUE_POOL_MAX) {
+        opal_lifo_push(&cmd_queue_pool, &queue->super);
+    } else {
+        opal_atomic_add_fetch_32(&cmd_queue_pool_count, -1);
+        cmd_queue_destroy(queue);
+    }
+}
+
+/* --------------------------------------------------------------------------
+ * ompi_op_gpu_session_pool_init
+ * -------------------------------------------------------------------------- */
+void
+ompi_op_gpu_session_pool_init(void)
+{
+    OBJ_CONSTRUCT(&cmd_queue_pool, opal_lifo_t);
+}
+
+/* --------------------------------------------------------------------------
+ * ompi_op_gpu_session_begin
+ *
+ * 1. Pop one entry from the lifo pool.
+ * 2. If dev_id matches: call queue->session_begin_fn to configure and
+ *    relaunch the kernel.  On success return the session.  On failure
+ *    (no kernel for this op/dtype), push the queue back and return NULL.
+ * 3. If dev_id doesn't match: push the queue back and fall through to
+ *    fresh allocation.
+ * 4. Pool miss: iterate op components to allocate a fresh cmd_queue and
+ *    call opc_session_begin.
+ * -------------------------------------------------------------------------- */
+ompi_op_gpu_session_t *
+ompi_op_gpu_session_begin(struct ompi_op_t *op,
+                          struct ompi_datatype_t *dtype,
+                          int dev_id)
+{
+    /* Check pool for a reusable cmd_queue. */
+    opal_list_item_t *item = opal_lifo_pop(&cmd_queue_pool);
+    if (NULL != item) {
+        opal_atomic_add_fetch_32(&cmd_queue_pool_count, -1);
+        ompi_op_gpu_cmd_queue_t *q = (ompi_op_gpu_cmd_queue_t *) item;
+
+        if (q->dev_id == dev_id) {
+            ompi_op_gpu_session_t *s = q->session_begin_fn(q, op, dtype);
+            if (NULL != s) {
+                return s;
+            }
+            /* No GPU kernel for this (op, dtype).  Return the cmd_queue to
+             * the pool so it can be reused for a future combination that does
+             * have a kernel.  Caller falls back to ompi_op_reduce(). */
+            cmd_queue_pool_push(q);
+            return NULL;
+        }
+
+        /* Wrong device — push back and fall through to fresh allocation. */
+        cmd_queue_pool_push(q);
+    }
+
+    /* Pool miss (or wrong device) — allocate a fresh cmd_queue. */
+    mca_base_component_list_item_t *cli;
+    OPAL_LIST_FOREACH(cli, &ompi_op_base_framework.framework_components,
+                      mca_base_component_list_item_t) {
+        const mca_base_component_t *bc = cli->cli_component;
+
+        if (1 != bc->mca_type_major_version ||
+            0 != bc->mca_type_minor_version ||
+            0 != bc->mca_type_release_version) {
+            continue;
+        }
+
+        const ompi_op_base_component_1_0_0_t *opc =
+            (const ompi_op_base_component_1_0_0_t *) bc;
+
+        if (NULL == opc->opc_cmd_queue_alloc ||
+            NULL == opc->opc_session_begin) {
+            continue;
+        }
+
+        ompi_op_gpu_cmd_queue_t *q = opc->opc_cmd_queue_alloc(dev_id);
+        if (NULL == q) {
+            continue;
+        }
+
+        /* Wire session_begin_fn into the cmd_queue. */
+        q->session_begin_fn = opc->opc_session_begin;
+
+        ompi_op_gpu_session_t *session = opc->opc_session_begin(q, op, dtype);
+        if (NULL == session) {
+            /* This component has no kernel for (op, dtype); discard the queue. */
+            cmd_queue_destroy(q);
+            continue;
+        }
+
+        return session;
+    }
+
+    return NULL;
+}
+
+/* --------------------------------------------------------------------------
+ * ompi_op_gpu_session_reduce
+ * -------------------------------------------------------------------------- */
+void
+ompi_op_gpu_session_reduce(ompi_op_gpu_session_t *session,
+                           const void *src1, const void *src2,
+                           void *dst, size_t count)
+{
+    session->reduce_fn(session, src1, src2, dst, count);
+}
+
+/* --------------------------------------------------------------------------
+ * ompi_op_gpu_session_end
+ *
+ * Stop the persistent kernel and return the cmd_queue to the pool.
+ * -------------------------------------------------------------------------- */
+void
+ompi_op_gpu_session_end(ompi_op_gpu_session_t *session)
+{
+    if (NULL == session) {
+        return;
+    }
+
+    /* Signal the kernel to exit and wait for the stream to drain. */
+    session->stop_fn(session);
+
+    ompi_op_gpu_cmd_queue_t *q = session->queue;
+    free(session);
+
+    cmd_queue_pool_push(q);
+}
+
+/* --------------------------------------------------------------------------
+ * ompi_op_gpu_session_pool_finalize
+ *
+ * Drain the pool, release all GPU resources, and destroy the lifo.
+ * Called once from ompi_op_base_close() during MPI_Finalize.
+ * -------------------------------------------------------------------------- */
+void
+ompi_op_gpu_session_pool_finalize(void)
+{
+    opal_list_item_t *item;
+    while (NULL != (item = opal_lifo_pop(&cmd_queue_pool))) {
+        cmd_queue_destroy((ompi_op_gpu_cmd_queue_t *) item);
+    }
+    OBJ_DESTRUCT(&cmd_queue_pool);
+}
diff --git a/ompi/op/op_gpu_session.h b/ompi/op/op_gpu_session.h
new file mode 100644
index 00000000000..36cba08fefb
--- /dev/null
+++ b/ompi/op/op_gpu_session.h
@@ -0,0 +1,135 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
+/*
+ * Copyright (c) 2025      Amazon.com, Inc. or its affiliates.  All rights
+ *                         reserved.
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+
+#ifndef OMPI_OP_GPU_SESSION_H
+#define OMPI_OP_GPU_SESSION_H
+
+#include "ompi_config.h"
+#include <stdbool.h>
+#include <stdint.h>
+#include "opal/class/opal_list.h"
+#include "opal/mca/allocator/allocator.h"
+
+BEGIN_C_DECLS
+
+struct ompi_op_t;
+struct ompi_datatype_t;
+
+/**
+ * Managed-memory command slot shared between the host and the persistent
+ * reduction kernel (accessible by both CPU and GPU via managed/unified memory).
+ *
+ * status lifecycle (per reduction call):
+ *   0 = idle       (initial; host resets after kernel signals done)
+ *   1 = work_ready (host → kernel: pointers and count are valid)
+ *   2 = done       (kernel → host: reduction complete)
+ */
+typedef struct {
+    const void      *src1;
+    const void      *src2;
+    void            *dst;
+    int64_t          count;
+    volatile int32_t status;
+} ompi_op_gpu_cmd_t;
+
+/**
+ * The expensive-to-create GPU resources needed by a persistent reduction
+ * kernel: a managed-memory command slot and a private GPU stream.  Pooled
+ * by dev_id so they can be reused across collectives without paying
+ * cudaMallocManaged/hipMallocManaged overhead on every call.
+ *
+ * GPU components (cuda, rocm) inherit from this base by placing it as the
+ * first member named "super" in their own cmd_queue struct, then allocate
+ * with OBJ_NEW and return the base pointer.  Destruction (including GPU
+ * resource cleanup) is dispatched automatically through the OBJ class chain.
+ *
+ * session_begin_fn is wired at cmd_queue_alloc time by the component.
+ */
+typedef struct ompi_op_gpu_cmd_queue_t {
+    opal_list_item_t             super;       /* MUST be first: used by opal_lifo_t pool */
+    int                          dev_id;
+    mca_allocator_base_module_t *allocator;  /* GPU scratch allocator for this device */
+    ompi_op_gpu_cmd_t           *cmd;        /* managed memory — shared with GPU */
+    /* Session creation hook — wired at cmd_queue_alloc time by the component. */
+    struct ompi_op_gpu_session_t *(*session_begin_fn)(
+        struct ompi_op_gpu_cmd_queue_t *queue,
+        struct ompi_op_t *op,
+        struct ompi_datatype_t *dtype);
+} ompi_op_gpu_cmd_queue_t;
+OBJ_CLASS_DECLARATION(ompi_op_gpu_cmd_queue_t);
+
+/**
+ * Per-collective GPU reduction session.  Created by ompi_op_gpu_session_begin()
+ * before a collective algorithm's reduction loop, and destroyed (with its
+ * cmd_queue recycled to the pool) by ompi_op_gpu_session_end().
+ *
+ * Sessions are lightweight: all expensive GPU resources (managed memory,
+ * GPU stream) live in the cmd_queue, which is pooled separately.  The session
+ * holds only a pointer to the cmd_queue and the dispatch function pointers.
+ *
+ * The component's opc_session_begin wires queue, allocator, reduce_fn, and
+ * stop_fn.  Callers must not set these fields directly.
+ *
+ * When no GPU op component supports the (op, dtype) combination, begin()
+ * returns NULL and all callers fall back to ompi_op_reduce().
+ */
+typedef struct ompi_op_gpu_session_t {
+    ompi_op_gpu_cmd_queue_t     *queue;
+    mca_allocator_base_module_t *allocator;  /* GPU scratch allocator (= queue->allocator) */
+    /* Dispatch hooks wired by the component's opc_session_begin. */
+    void (*reduce_fn)(struct ompi_op_gpu_session_t *session,
+                      const void *src1, const void *src2, void *dst, size_t count);
+    /* Signal the persistent kernel to exit and synchronize the stream.
+     * The cmd_queue's resources remain valid for reuse after this call. */
+    void (*stop_fn)(struct ompi_op_gpu_session_t *session);
+} ompi_op_gpu_session_t;
+
+/**
+ * Create a GPU reduction session and launch a persistent reduction kernel.
+ * Returns NULL if no GPU op component supports this (op, dtype) combination
+ * or if no GPU op component is loaded — the caller must then use ompi_op_reduce.
+ */
+OMPI_DECLSPEC ompi_op_gpu_session_t *ompi_op_gpu_session_begin(struct ompi_op_t *op,
+                                                                struct ompi_datatype_t *dtype,
+                                                                int dev_id);
+
+/**
+ * Post one reduction command (src1 op src2 → dst) to the persistent kernel and
+ * wait for completion.  src2 may alias dst for in-place operations.
+ * Behavior is undefined if session is NULL.
+ */
+OMPI_DECLSPEC void ompi_op_gpu_session_reduce(ompi_op_gpu_session_t *session,
+                                               const void *src1, const void *src2,
+                                               void *dst, size_t count);
+
+/**
+ * Stop the persistent kernel and return the session's cmd_queue to the pool
+ * for reuse.  GPU stream and managed memory remain allocated; a future begin()
+ * call for the same dev_id will relaunch the kernel without allocating new
+ * resources.  NULL-safe.
+ */
+OMPI_DECLSPEC void ompi_op_gpu_session_end(ompi_op_gpu_session_t *session);
+
+/**
+ * Initialize the cmd_queue pool.  Must be called once before any session
+ * operations (from ompi_op_base_open via the framework open hook).
+ */
+OMPI_DECLSPEC void ompi_op_gpu_session_pool_init(void);
+
+/**
+ * Drain and permanently destroy all pooled cmd_queues.  Must be called once
+ * during MPI finalization (from ompi_op_base_close).
+ */
+OMPI_DECLSPEC void ompi_op_gpu_session_pool_finalize(void);
+
+END_C_DECLS
+
+#endif /* OMPI_OP_GPU_SESSION_H */
diff --git a/opal/mca/accelerator/base/accelerator_base_frame.c b/opal/mca/accelerator/base/accelerator_base_frame.c
index 55e13f3773b..bbcfc84b37c 100644
--- a/opal/mca/accelerator/base/accelerator_base_frame.c
+++ b/opal/mca/accelerator/base/accelerator_base_frame.c
@@ -18,6 +18,9 @@
 #include "opal/mca/accelerator/base/base.h"
 #include "opal/mca/base/base.h"
 #include "opal/mca/mca.h"
+#include "opal/mca/allocator/allocator.h"
+#include "opal/mca/allocator/basic/allocator_basic.h"
+#include "opal/mca/threads/mutex.h"
 
 /*
  * The following file was created by configure.  It contains extern
@@ -30,6 +33,134 @@
 opal_accelerator_base_module_t opal_accelerator = {0};
 opal_accelerator_base_component_t opal_accelerator_base_selected_component = {{0}};
 
+/* Per-device allocator pool — allocated lazily to num_devices on first use. */
+static mca_allocator_base_module_t **opal_accel_device_allocators = NULL;
+static int opal_accel_num_devices = 0;
+static opal_mutex_t opal_accel_alloc_lock = OPAL_MUTEX_STATIC_INIT;
+
+/*
+ * Tracks a single GPU segment returned by opal_accelerator.mem_alloc so it
+ * can be released on cleanup.  The basic allocator never calls seg_free during
+ * normal operation (only compact/finalize would, and compact is a no-op), so
+ * we keep our own list instead of relying on it.
+ */
+struct opal_accel_alloc_seg_t {
+    opal_list_item_t super;
+    void *ptr;
+};
+typedef struct opal_accel_alloc_seg_t opal_accel_alloc_seg_t;
+OBJ_CLASS_INSTANCE(opal_accel_alloc_seg_t, opal_list_item_t, NULL, NULL);
+
+typedef struct {
+    int dev_id;
+    opal_list_t segs; /* every GPU segment allocated via seg_alloc */
+} opal_accel_alloc_ctx_t;
+
+/*
+ * seg_alloc is called (under the basic allocator's internal lock) whenever the
+ * free list has no block large enough.  Record each new GPU segment so it can
+ * be released on cleanup.
+ */
+static void *opal_accel_seg_alloc(void *ctx, size_t *size)
+{
+    opal_accel_alloc_ctx_t *ac = (opal_accel_alloc_ctx_t *) ctx;
+    opal_accel_alloc_seg_t *seg;
+    void *ptr = NULL;
+
+    if (OPAL_SUCCESS != opal_accelerator.mem_alloc(ac->dev_id, &ptr, *size)) {
+        return NULL;
+    }
+
+    seg = OBJ_NEW(opal_accel_alloc_seg_t);
+    if (OPAL_LIKELY(NULL != seg)) {
+        seg->ptr = ptr;
+        opal_list_append(&ac->segs, &seg->super);
+    }
+    return ptr;
+}
+
+/* seg_free is wired into the allocator API but never invoked during normal
+ * operation (basic allocator compact is a no-op).  Cleanup is handled
+ * explicitly in opal_accelerator_base_frame_close via the segs list. */
+static void opal_accel_seg_free(void *ctx, void *seg)
+{
+    (void) ctx;
+    (void) seg;
+}
+
+mca_allocator_base_module_t *
+opal_accelerator_base_get_device_allocator(int dev_id)
+{
+    mca_allocator_base_module_t *alloc;
+    opal_accel_alloc_ctx_t *ctx;
+
+    if (dev_id < 0 || NULL == opal_accelerator.mem_alloc) {
+        return NULL;
+    }
+
+    /* Fast path: array already sized and slot already filled. */
+    if (NULL != opal_accel_device_allocators
+        && dev_id < opal_accel_num_devices
+        && NULL != opal_accel_device_allocators[dev_id]) {
+        return opal_accel_device_allocators[dev_id];
+    }
+
+    OPAL_THREAD_LOCK(&opal_accel_alloc_lock);
+
+    /* Lazily allocate the per-device array on first call. */
+    if (NULL == opal_accel_device_allocators) {
+        int num_devices = 0;
+        if (OPAL_SUCCESS != opal_accelerator.num_devices(&num_devices) || num_devices <= 0) {
+            OPAL_THREAD_UNLOCK(&opal_accel_alloc_lock);
+            return NULL;
+        }
+        opal_accel_device_allocators = calloc(num_devices,
+                                              sizeof(*opal_accel_device_allocators));
+        if (NULL == opal_accel_device_allocators) {
+            OPAL_THREAD_UNLOCK(&opal_accel_alloc_lock);
+            return NULL;
+        }
+        opal_accel_num_devices = num_devices;
+    }
+
+    if (dev_id >= opal_accel_num_devices) {
+        OPAL_THREAD_UNLOCK(&opal_accel_alloc_lock);
+        return NULL;
+    }
+
+    if (NULL == opal_accel_device_allocators[dev_id]) {
+        ctx = (opal_accel_alloc_ctx_t *) malloc(sizeof(*ctx));
+        if (NULL == ctx) {
+            OPAL_THREAD_UNLOCK(&opal_accel_alloc_lock);
+            return NULL;
+        }
+        ctx->dev_id = dev_id;
+        OBJ_CONSTRUCT(&ctx->segs, opal_list_t);
+        /*
+         * Use the basic (first-fit + coalescing) allocator rather than the
+         * bucket allocator.  When a large block is freed it can be split to
+         * serve a smaller future request, and adjacent free blocks are merged
+         * back together, giving good reuse across the varying scratch-buffer
+         * sizes produced by collective algorithms.  GPU segments are retained
+         * in the free list for the lifetime of the process; the GPU driver
+         * reclaims device memory on context teardown.
+         */
+        alloc = mca_allocator_basic_component_init(true,
+                                                   opal_accel_seg_alloc,
+                                                   opal_accel_seg_free,
+                                                   ctx);
+        if (NULL == alloc) {
+            free(ctx);
+            OPAL_THREAD_UNLOCK(&opal_accel_alloc_lock);
+            return NULL;
+        }
+        opal_accel_device_allocators[dev_id] = alloc;
+    }
+
+    OPAL_THREAD_UNLOCK(&opal_accel_alloc_lock);
+    return opal_accel_device_allocators[dev_id];
+}
+
 static int opal_accelerator_base_frame_register(mca_base_register_flag_t flags)
 {
     return OPAL_SUCCESS;
@@ -37,6 +168,31 @@ static int opal_accelerator_base_frame_register(mca_base_register_flag_t flags)
 
 static int opal_accelerator_base_frame_close(void)
 {
+    if (NULL != opal_accel_device_allocators) {
+        for (int i = 0; i < opal_accel_num_devices; i++) {
+            if (NULL != opal_accel_device_allocators[i]) {
+                opal_accel_alloc_ctx_t *ctx =
+                    (opal_accel_alloc_ctx_t *) opal_accel_device_allocators[i]->alc_context;
+                opal_accel_alloc_seg_t *seg;
+
+                /* Release all GPU segments tracked in seg_alloc before the
+                 * basic allocator frees its internal structures. */
+                while (NULL != (seg = (opal_accel_alloc_seg_t *)
+                                      opal_list_remove_first(&ctx->segs))) {
+                    opal_accelerator.mem_release(ctx->dev_id, seg->ptr);
+                    OBJ_RELEASE(seg);
+                }
+                OBJ_DESTRUCT(&ctx->segs);
+
+                opal_accel_device_allocators[i]->alc_finalize(opal_accel_device_allocators[i]);
+                free(ctx);
+                opal_accel_device_allocators[i] = NULL;
+            }
+        }
+        free(opal_accel_device_allocators);
+        opal_accel_device_allocators = NULL;
+        opal_accel_num_devices = 0;
+    }
     return mca_base_framework_components_close(&opal_accelerator_base_framework, NULL);
 }
 
diff --git a/opal/mca/accelerator/base/base.h b/opal/mca/accelerator/base/base.h
index e5922032ea8..94892d15c90 100644
--- a/opal/mca/accelerator/base/base.h
+++ b/opal/mca/accelerator/base/base.h
@@ -20,6 +20,7 @@
 #include "opal/mca/accelerator/accelerator.h"
 #include "opal/mca/base/mca_base_framework.h"
 #include "opal/mca/mca.h"
+#include "opal/mca/allocator/allocator.h"
 
 
 BEGIN_C_DECLS
@@ -33,6 +34,14 @@ OPAL_DECLSPEC int opal_accelerator_base_select(void);
 
 OPAL_DECLSPEC extern opal_accelerator_base_component_t opal_accelerator_base_selected_component;
 
+/**
+ * Return a pooled allocator for device memory on the given device.
+ * Created lazily and cached for the lifetime of the process.
+ * Returns NULL if no accelerator is available or dev_id is invalid.
+ */
+OPAL_DECLSPEC mca_allocator_base_module_t *
+opal_accelerator_base_get_device_allocator(int dev_id);
+
 END_C_DECLS
 
 #endif