From 405bb91bff72ede371e781bf74b48e5ad9489575 Mon Sep 17 00:00:00 2001
From: "Yu-Hsiang M. Tsai" <yhmtsai@gmail.com>
Date: Thu, 27 Mar 2025 18:25:28 +0100
Subject: [PATCH 01/14] add rocsparse spgemm

---
 include/spblas/vendor/rocsparse/multiply.hpp  |   2 +-
 .../vendor/rocsparse/multiply_spgemm.hpp      | 326 ++++++++++++++++++
 include/spblas/vendor/rocsparse/rocsparse.hpp |   1 +
 3 files changed, 328 insertions(+), 1 deletion(-)
 create mode 100644 include/spblas/vendor/rocsparse/multiply_spgemm.hpp
diff --git a/include/spblas/vendor/rocsparse/multiply.hpp b/include/spblas/vendor/rocsparse/multiply.hpp
index f276fbf..7d88c5c 100644
--- a/include/spblas/vendor/rocsparse/multiply.hpp
+++ b/include/spblas/vendor/rocsparse/multiply.hpp
@@ -86,7 +86,7 @@ class spmv_state_t {
     // only allocate the new workspace when the requiring workspace larger than
     // current
     if (buffer_size > this->buffer_size_) {
-      this->alloc_.deallocate(this->workspace_, buffer_size_);
+      this->alloc_.deallocate(this->workspace_, this->buffer_size_);
       this->buffer_size_ = buffer_size;
       this->workspace_ = this->alloc_.allocate(buffer_size);
     }
diff --git a/include/spblas/vendor/rocsparse/multiply_spgemm.hpp b/include/spblas/vendor/rocsparse/multiply_spgemm.hpp
new file mode 100644
index 0000000..7f215df
--- /dev/null
+++ b/include/spblas/vendor/rocsparse/multiply_spgemm.hpp
@@ -0,0 +1,326 @@
+#pragma once
+
+#include <functional>
+#include <memory>
+#include <type_traits>
+
+#include <hip/hip_runtime.h>
+#include <rocsparse/rocsparse.h>
+
+#include <spblas/detail/ranges.hpp>
+#include <spblas/detail/view_inspectors.hpp>
+
+#include "exception.hpp"
+#include "hip_allocator.hpp"
+#include "types.hpp"
+
+namespace spblas {
+namespace __rocsparse {
+
+template <typename T>
+T create_null_matrix() {
+  return {nullptr, nullptr, nullptr, index<index_t>{0, 0}, 0};
+}
+
+} // namespace __rocsparse
+
+class spgemm_state_t {
+public:
+  spgemm_state_t() : spgemm_state_t(rocsparse::hip_allocator<char>{}) {}
+
+  spgemm_state_t(rocsparse::hip_allocator<char> alloc)
+      : alloc_(alloc), buffer_size_(0), workspace_(nullptr), result_nnz_(0),
+        result_shape_(0, 0) {
+    rocsparse_handle handle;
+    __rocsparse::throw_if_error(rocsparse_create_handle(&handle));
+    if (auto stream = alloc.stream()) {
+      rocsparse_set_stream(handle, stream);
+    }
+    handle_ = handle_manager(handle, [](rocsparse_handle handle) {
+      __rocsparse::throw_if_error(rocsparse_destroy_handle(handle));
+    });
+  }
+
+  spgemm_state_t(rocsparse::hip_allocator<char> alloc, rocsparse_handle handle)
+      : alloc_(alloc), buffer_size_(0), workspace_(nullptr), result_nnz_(0),
+        result_shape_(0, 0) {
+    handle_ = handle_manager(handle, [](rocsparse_handle handle) {
+      // it is provided by user, we do not delete it at all.
+    });
+  }
+
+  ~spgemm_state_t() {
+    alloc_.deallocate(this->workspace_, this->buffer_size_);
+    __rocsparse::throw_if_error(rocsparse_destroy_spmat_descr(this->mat_a_));
+    __rocsparse::throw_if_error(rocsparse_destroy_spmat_descr(this->mat_b_));
+    __rocsparse::throw_if_error(rocsparse_destroy_spmat_descr(this->mat_c_));
+    __rocsparse::throw_if_error(rocsparse_destroy_spmat_descr(this->mat_d_));
+  }
+
+  auto result_shape() {
+    return this->result_shape_;
+  }
+
+  auto result_nnz() {
+    return this->result_nnz_;
+  }
+
+  template <matrix A, matrix B, matrix C, matrix D>
+    requires __detail::has_csr_base<A> && __detail::has_csr_base<B> &&
+             __detail::is_csr_view_v<C> && __detail::has_csr_base<D>
+  void multiply_compute(A&& a, B&& b, C&& c, D&& d) {
+    auto a_base = __detail::get_ultimate_base(a);
+    auto b_base = __detail::get_ultimate_base(b);
+    auto d_base = __detail::get_ultimate_base(d);
+    using matrix_type = decltype(a_base);
+    using input_type = decltype(b_base);
+    using output_type = std::remove_reference_t<decltype(c)>;
+    using value_type = typename matrix_type::scalar_type;
+
+    size_t buffer_size = 0;
+
+    auto alpha_optional = __detail::get_scaling_factor(a, b);
+    value_type alpha = alpha_optional.value_or(1);
+    auto beta_optional = __detail::get_scaling_factor(d);
+    value_type beta = beta_optional.value_or(1);
+    auto handle = this->handle_.get();
+    // Create sparse matrix A in CSR format
+    __rocsparse::throw_if_error(rocsparse_create_csr_descr(
+        &this->mat_a_, __backend::shape(a_base)[0], __backend::shape(a_base)[1],
+        a_base.values().size(), a_base.rowptr().data(), a_base.colind().data(),
+        a_base.values().data(),
+        to_rocsparse_indextype<typename matrix_type::offset_type>(),
+        to_rocsparse_indextype<typename matrix_type::index_type>(),
+        rocsparse_index_base_zero, to_rocsparse_datatype<value_type>()));
+    __rocsparse::throw_if_error(rocsparse_create_csr_descr(
+        &this->mat_b_, __backend::shape(b_base)[0], __backend::shape(b_base)[1],
+        b_base.values().size(), b_base.rowptr().data(), b_base.colind().data(),
+        b_base.values().data(),
+        to_rocsparse_indextype<typename input_type::offset_type>(),
+        to_rocsparse_indextype<typename input_type::index_type>(),
+        rocsparse_index_base_zero,
+        to_rocsparse_datatype<typename input_type::scalar_type>()));
+    __rocsparse::throw_if_error(rocsparse_create_csr_descr(
+        &this->mat_c_, __backend::shape(a_base)[0], __backend::shape(b_base)[1],
+        0, c.rowptr().data(), NULL, NULL,
+        to_rocsparse_indextype<typename output_type::offset_type>(),
+        to_rocsparse_indextype<typename output_type::index_type>(),
+        rocsparse_index_base_zero,
+        to_rocsparse_datatype<typename output_type::scalar_type>()));
+    __rocsparse::throw_if_error(rocsparse_create_csr_descr(
+        &this->mat_d_, __backend::shape(d_base)[0], __backend::shape(d_base)[1],
+        d_base.values().size(), d_base.rowptr().data(), d_base.colind().data(),
+        d_base.values().data(),
+        to_rocsparse_indextype<typename output_type::offset_type>(),
+        to_rocsparse_indextype<typename output_type::index_type>(),
+        rocsparse_index_base_zero,
+        to_rocsparse_datatype<typename output_type::scalar_type>()));
+    //--------------------------------------------------------------------------
+    // SpGEMM Computation
+    // ask buffer_size bytes for external memory
+    __rocsparse::throw_if_error(rocsparse_spgemm(
+        handle, rocsparse_operation_none, rocsparse_operation_none, &alpha,
+        this->mat_a_, this->mat_b_, &beta, this->mat_d_, this->mat_c_,
+        to_rocsparse_datatype<value_type>(), rocsparse_spgemm_alg_default,
+        rocsparse_spgemm_stage_buffer_size, &buffer_size, nullptr));
+    if (buffer_size > this->buffer_size_) {
+      this->alloc_.deallocate(workspace_, this->buffer_size_);
+      this->buffer_size_ = buffer_size;
+      workspace_ = this->alloc_.allocate(buffer_size);
+    }
+    __rocsparse::throw_if_error(rocsparse_spgemm(
+        handle, rocsparse_operation_none, rocsparse_operation_none, &alpha,
+        this->mat_a_, this->mat_b_, &beta, this->mat_d_, this->mat_c_,
+        to_rocsparse_datatype<value_type>(), rocsparse_spgemm_alg_default,
+        rocsparse_spgemm_stage_nnz, &this->buffer_size_, this->workspace_));
+    // get matrix C non-zero entries and size
+    int64_t c_num_rows;
+    int64_t c_num_cols;
+    __rocsparse::throw_if_error(rocsparse_spmat_get_size(
+        this->mat_c_, &c_num_rows, &c_num_cols, &this->result_nnz_));
+    this->result_shape_ = index<index_t>(c_num_rows, c_num_cols);
+  }
+
+  template <matrix A, matrix B, matrix C, matrix D>
+    requires __detail::has_csr_base<A> && __detail::has_csr_base<B> &&
+             __detail::is_csr_view_v<C> && __detail::has_csr_base<D>
+  void multiply_fill(A&& a, B&& b, C&& c, D&& d) {
+    auto a_base = __detail::get_ultimate_base(a);
+    auto b_base = __detail::get_ultimate_base(b);
+    using matrix_type = decltype(a_base);
+    using input_type = decltype(b_base);
+    using output_type = std::remove_reference_t<decltype(c)>;
+    using value_type = typename matrix_type::scalar_type;
+
+    auto alpha_optional = __detail::get_scaling_factor(a, b);
+    tensor_scalar_t<A> alpha = alpha_optional.value_or(1);
+    value_type alpha_val = alpha;
+    auto beta_optional = __detail::get_scaling_factor(d);
+    value_type beta = beta_optional.value_or(1);
+
+    __rocsparse::throw_if_error(rocsparse_csr_set_pointers(
+        this->mat_c_, c.rowptr().data(), c.colind().data(), c.values().data()));
+
+    __rocsparse::throw_if_error(rocsparse_spgemm(
+        handle_.get(), rocsparse_operation_none, rocsparse_operation_none,
+        &alpha, this->mat_a_, this->mat_b_, &beta, this->mat_d_, this->mat_c_,
+        to_rocsparse_datatype<value_type>(), rocsparse_spgemm_alg_default,
+        rocsparse_spgemm_stage_compute, &this->buffer_size_, workspace_));
+  }
+
+  template <matrix A, matrix B, matrix C, matrix D>
+    requires __detail::has_csr_base<A> && __detail::has_csr_base<B> &&
+             __detail::is_csr_view_v<C> && __detail::has_csr_base<D>
+  void multiply_symbolic_fill(A&& a, B&& b, C&& c, D&& d) {
+    auto a_base = __detail::get_ultimate_base(a);
+    auto b_base = __detail::get_ultimate_base(b);
+    auto d_base = __detail::get_ultimate_base(d);
+    using matrix_type = decltype(a_base);
+    using input_type = decltype(b_base);
+    using output_type = std::remove_reference_t<decltype(c)>;
+    using value_type = typename matrix_type::scalar_type;
+
+    auto alpha_optional = __detail::get_scaling_factor(a, b);
+    value_type alpha = alpha_optional.value_or(1);
+    auto beta_optional = __detail::get_scaling_factor(d);
+    value_type beta = beta_optional.value_or(1);
+
+    __rocsparse::throw_if_error(rocsparse_csr_set_pointers(
+        this->mat_c_, c.rowptr().data(), c.colind().data(), c.values().data()));
+
+    __rocsparse::throw_if_error(rocsparse_spgemm(
+        this->handle_.get(), rocsparse_operation_none, rocsparse_operation_none,
+        &alpha, this->mat_a_, this->mat_b_, &beta, this->mat_d_, this->mat_c_,
+        to_rocsparse_datatype<value_type>(), rocsparse_spgemm_alg_default,
+        rocsparse_spgemm_stage_symbolic, &this->buffer_size_,
+        this->workspace_));
+  }
+
+  template <matrix A, matrix B, matrix C, matrix D>
+    requires __detail::has_csr_base<A> && __detail::has_csr_base<B> &&
+             __detail::is_csr_view_v<C> && __detail::has_csr_base<D>
+  void multiply_numeric(A&& a, B&& b, C&& c, D&& d) {
+    auto a_base = __detail::get_ultimate_base(a);
+    auto b_base = __detail::get_ultimate_base(b);
+    using matrix_type = decltype(a_base);
+    using input_type = decltype(b_base);
+    using output_type = std::remove_reference_t<decltype(c)>;
+    using value_type = typename matrix_type::scalar_type;
+
+    auto alpha_optional = __detail::get_scaling_factor(a, b);
+    tensor_scalar_t<A> alpha = alpha_optional.value_or(1);
+    value_type alpha_val = alpha;
+    auto beta_optional = __detail::get_scaling_factor(d);
+    value_type beta = beta_optional.value_or(1);
+
+    __rocsparse::throw_if_error(rocsparse_spgemm(
+        this->handle_.get(), rocsparse_operation_none, rocsparse_operation_none,
+        &alpha, this->mat_a_, this->mat_b_, &beta, this->mat_d_, this->mat_c_,
+        to_rocsparse_datatype<value_type>(), rocsparse_spgemm_alg_default,
+        rocsparse_spgemm_stage_numeric, &this->buffer_size_, this->workspace_));
+  }
+
+private:
+  using handle_manager =
+      std::unique_ptr<std::pointer_traits<rocsparse_handle>::element_type,
+                      std::function<void(rocsparse_handle)>>;
+  handle_manager handle_;
+  rocsparse::hip_allocator<char> alloc_;
+  long unsigned int buffer_size_;
+  char* workspace_;
+  index<index_t> result_shape_;
+  index_t result_nnz_;
+  rocsparse_spmat_descr mat_a_;
+  rocsparse_spmat_descr mat_b_;
+  rocsparse_spmat_descr mat_c_;
+  rocsparse_spmat_descr mat_d_;
+};
+
+template <matrix A, matrix B, matrix C>
+  requires __detail::has_csr_base<A> && __detail::has_csr_base<B> &&
+           __detail::is_csr_view_v<C>
+void multiply_inspect(spgemm_state_t& spgemm_handle, A&& a, B&& b, C&& c) {}
+
+template <matrix A, matrix B, matrix C, matrix D>
+  requires __detail::has_csr_base<A> && __detail::has_csr_base<B> &&
+           __detail::is_csr_view_v<C> && __detail::has_csr_base<D>
+void multiply_compute(spgemm_state_t& spgemm_handle, A&& a, B&& b, C&& c,
+                      D&& d) {
+  spgemm_handle.multiply_compute(a, b, c, d);
+}
+
+template <matrix A, matrix B, matrix C, matrix D>
+  requires __detail::has_csr_base<A> && __detail::has_csr_base<B> &&
+           __detail::is_csr_view_v<C> && __detail::has_csr_base<D>
+void multiply_fill(spgemm_state_t& spgemm_handle, A&& a, B&& b, C&& c, D&& d) {
+  spgemm_handle.multiply_fill(a, b, c, d);
+}
+
+template <matrix A, matrix B, matrix C, matrix D>
+  requires __detail::has_csr_base<A> && __detail::has_csr_base<B> &&
+           __detail::is_csr_view_v<C> && __detail::has_csr_base<D>
+void multiply_symbolic_compute(spgemm_state_t& spgemm_handle, A&& a, B&& b,
+                               C&& c, D&& d) {
+  spgemm_handle.multiply_compute(a, b, c, d);
+}
+
+template <matrix A, matrix B, matrix C, matrix D>
+  requires __detail::has_csr_base<A> && __detail::has_csr_base<B> &&
+           __detail::is_csr_view_v<C> && __detail::has_csr_base<D>
+void multiply_symbolic_fill(spgemm_state_t& spgemm_handle, A&& a, B&& b, C&& c,
+                            D&& d) {
+  spgemm_handle.multiply_symbolic_fill(a, b, c, d);
+}
+
+template <matrix A, matrix B, matrix C, matrix D>
+  requires __detail::has_csr_base<A> && __detail::has_csr_base<B> &&
+           __detail::is_csr_view_v<C> && __detail::has_csr_base<D>
+void multiply_numeric(spgemm_state_t& spgemm_handle, A&& a, B&& b, C&& c,
+                      D&& d) {
+  spgemm_handle.multiply_numeric(a, b, c, d);
+}
+
+// the followings support C = A*B by giving null D matrix.
+template <matrix A, matrix B, matrix C>
+  requires __detail::has_csr_base<A> && __detail::has_csr_base<B> &&
+           __detail::is_csr_view_v<C>
+void multiply_compute(spgemm_state_t& spgemm_handle, A&& a, B&& b, C&& c) {
+  auto d = __rocsparse::create_null_matrix<std::remove_reference_t<C>>();
+  spgemm_handle.multiply_compute(a, b, c, scaled(0.0, d));
+}
+
+template <matrix A, matrix B, matrix C>
+  requires __detail::has_csr_base<A> && __detail::has_csr_base<B> &&
+           __detail::is_csr_view_v<C>
+void multiply_fill(spgemm_state_t& spgemm_handle, A&& a, B&& b, C&& c) {
+  auto d = __rocsparse::create_null_matrix<std::remove_reference_t<C>>();
+  spgemm_handle.multiply_fill(a, b, c, scaled(0.0, d));
+}
+
+template <matrix A, matrix B, matrix C>
+  requires __detail::has_csr_base<A> && __detail::has_csr_base<B> &&
+           __detail::is_csr_view_v<C>
+void multiply_symbolic_compute(spgemm_state_t& spgemm_handle, A&& a, B&& b,
+                               C&& c) {
+  auto d = __rocsparse::create_null_matrix<std::remove_reference_t<C>>();
+  spgemm_handle.multiply_compute(a, b, c, scaled(0.0, d));
+}
+
+template <matrix A, matrix B, matrix C>
+  requires __detail::has_csr_base<A> && __detail::has_csr_base<B> &&
+           __detail::is_csr_view_v<C>
+void multiply_symbolic_fill(spgemm_state_t& spgemm_handle, A&& a, B&& b,
+                            C&& c) {
+  auto d = __rocsparse::create_null_matrix<std::remove_reference_t<C>>();
+  spgemm_handle.multiply_symbolic_fill(a, b, c, scaled(0.0, d));
+}
+
+template <matrix A, matrix B, matrix C>
+  requires __detail::has_csr_base<A> && __detail::has_csr_base<B> &&
+           __detail::is_csr_view_v<C>
+void multiply_numeric(spgemm_state_t& spgemm_handle, A&& a, B&& b, C&& c) {
+  auto d = __rocsparse::create_null_matrix<std::remove_reference_t<C>>();
+  spgemm_handle.multiply_numeric(a, b, c, scaled(0.0, d));
+}
+
+} // namespace spblas
diff --git a/include/spblas/vendor/rocsparse/rocsparse.hpp b/include/spblas/vendor/rocsparse/rocsparse.hpp
index 7caa698..014b2ba 100644
--- a/include/spblas/vendor/rocsparse/rocsparse.hpp
+++ b/include/spblas/vendor/rocsparse/rocsparse.hpp
@@ -1,3 +1,4 @@
 #pragma once
 
 #include "multiply.hpp"
+#include "multiply_spgemm.hpp"

From 9263ec1978212f5437695f53f8787a76ecf18d70 Mon Sep 17 00:00:00 2001
From: "Yu-Hsiang M. Tsai" <yhmtsai@gmail.com>
Date: Thu, 27 Mar 2025 18:26:42 +0100
Subject: [PATCH 02/14] add rocsparse spgemm with 3 args test

---
 include/spblas/vendor/rocsparse/exception.hpp |   4 +-
 test/gtest/CMakeLists.txt                     |   6 +-
 test/gtest/device/spgemm_test.cpp             | 273 ++++++++++++++++++
 3 files changed, 280 insertions(+), 3 deletions(-)
 create mode 100644 test/gtest/device/spgemm_test.cpp

diff --git a/include/spblas/vendor/rocsparse/exception.hpp b/include/spblas/vendor/rocsparse/exception.hpp
index 6a2ed5c..cb836b5 100644
--- a/include/spblas/vendor/rocsparse/exception.hpp
+++ b/include/spblas/vendor/rocsparse/exception.hpp
@@ -10,7 +10,7 @@ namespace spblas {
 namespace __rocsparse {
 
 // Throw an exception if the hipError_t is not hipSuccess.
-void throw_if_error(hipError_t error_code, std::string prefix = "") {
+inline void throw_if_error(hipError_t error_code, std::string prefix = "") {
   if (error_code == hipSuccess) {
     return;
   }
@@ -21,7 +21,7 @@ void throw_if_error(hipError_t error_code, std::string prefix = "") {
 }
 
 // Throw an exception if the rocsparse_status is not rocsparse_status_success.
-void throw_if_error(rocsparse_status error_code) {
+inline void throw_if_error(rocsparse_status error_code) {
   if (error_code == rocsparse_status_success) {
     return;
   } else if (error_code == rocsparse_status_invalid_handle) {
diff --git a/test/gtest/CMakeLists.txt b/test/gtest/CMakeLists.txt
index 97c54d2..2ac50d3 100644
--- a/test/gtest/CMakeLists.txt
+++ b/test/gtest/CMakeLists.txt
@@ -22,7 +22,11 @@ if (NOT SPBLAS_GPU_BACKEND)
     triangular_solve_test.cpp
   )
 else()
-  set(TEST_SOURCES device/spmv_test.cpp)
+  if(ENABLE_ROCSPARSE)
+    set(TEST_SOURCES device/spmv_test.cpp device/spgemm_test.cpp)
+  else()
+    set(TEST_SOURCES device/spmv_test.cpp)
+  endif()
   add_device_test(TEST_SOURCES)
 endif()
 
diff --git a/test/gtest/device/spgemm_test.cpp b/test/gtest/device/spgemm_test.cpp
new file mode 100644
index 0000000..4662ffd
--- /dev/null
+++ b/test/gtest/device/spgemm_test.cpp
@@ -0,0 +1,273 @@
+
+#include "../util.hpp"
+#include <spblas/spblas.hpp>
+
+#include <gtest/gtest.h>
+#include <thrust/device_vector.h>
+
+using value_t = float;
+using index_t = spblas::index_t;
+using offset_t = spblas::offset_t;
+
+TEST(CsrView, SpGEMM) {
+  for (auto&& [m, k, nnz] : util::dims) {
+    for (auto&& n : {m, k}) {
+      auto [a_values, a_rowptr, a_colind, a_shape, a_nnz] =
+          spblas::generate_csr<value_t, index_t, offset_t>(m, k, nnz);
+      thrust::device_vector<value_t> d_a_values(a_values);
+      thrust::device_vector<offset_t> d_a_rowptr(a_rowptr);
+      thrust::device_vector<index_t> d_a_colind(a_colind);
+      spblas::csr_view<value_t, index_t, offset_t> d_a(
+          d_a_values.data().get(), d_a_rowptr.data().get(),
+          d_a_colind.data().get(), a_shape, a_nnz);
+      spblas::csr_view<value_t, index_t, offset_t> a(a_values, a_rowptr,
+                                                     a_colind, a_shape, a_nnz);
+
+      auto [b_values, b_rowptr, b_colind, b_shape, b_nnz] =
+          spblas::generate_csr<value_t, index_t, offset_t>(k, n, nnz);
+      thrust::device_vector<value_t> d_b_values(b_values);
+      thrust::device_vector<offset_t> d_b_rowptr(b_rowptr);
+      thrust::device_vector<index_t> d_b_colind(b_colind);
+      spblas::csr_view<value_t, index_t, offset_t> d_b(
+          d_b_values.data().get(), d_b_rowptr.data().get(),
+          d_b_colind.data().get(), b_shape, b_nnz);
+      spblas::csr_view<value_t, index_t, offset_t> b(b_values, b_rowptr,
+                                                     b_colind, b_shape, b_nnz);
+
+      thrust::device_vector<offset_t> d_c_rowptr(m + 1);
+
+      spblas::csr_view<value_t, index_t, offset_t> d_c(
+          nullptr, d_c_rowptr.data().get(), nullptr, {m, n}, 0);
+
+      spblas::spgemm_state_t state;
+      spblas::multiply_compute(state, d_a, d_b, d_c);
+      auto nnz = state.result_nnz();
+      thrust::device_vector<value_t> d_c_values(nnz);
+      thrust::device_vector<index_t> d_c_colind(nnz);
+      std::span<value_t> d_c_values_span(d_c_values.data().get(), nnz);
+      std::span<offset_t> d_c_rowptr_span(d_c_rowptr.data().get(), m + 1);
+      std::span<index_t> d_c_colind_span(d_c_colind.data().get(), nnz);
+      d_c.update(d_c_values_span, d_c_rowptr_span, d_c_colind_span, {m, n},
+                 nnz);
+
+      spblas::multiply_fill(state, d_a, d_b, d_c);
+
+      std::vector<value_t> c_values(nnz);
+      std::vector<offset_t> c_rowptr(m + 1);
+      std::vector<index_t> c_colind(nnz);
+      thrust::copy(d_c_values.begin(), d_c_values.end(), c_values.begin());
+      thrust::copy(d_c_rowptr.begin(), d_c_rowptr.end(), c_rowptr.begin());
+      thrust::copy(d_c_colind.begin(), d_c_colind.end(), c_colind.begin());
+      spblas::csr_view<value_t, index_t, offset_t> c(c_values, c_rowptr,
+                                                     c_colind, {m, n}, nnz);
+
+      spblas::__backend::spa_accumulator<value_t, index_t> c_row_ref(
+          spblas::__backend::shape(c)[1]);
+
+      spblas::__backend::spa_accumulator<value_t, index_t> c_row_acc(
+          spblas::__backend::shape(c)[1]);
+
+      for (auto&& [i, a_row] : spblas::__backend::rows(a)) {
+        c_row_ref.clear();
+        for (auto&& [k, a_v] : a_row) {
+          auto&& b_row = spblas::__backend::lookup_row(b, k);
+
+          for (auto&& [j, b_v] : b_row) {
+            c_row_ref[j] += a_v * b_v;
+          }
+        }
+
+        auto&& c_row = spblas::__backend::lookup_row(c, i);
+
+        // Accumulate output into `c_row_acc` so that we can allow
+        // duplicate column indices.
+        c_row_acc.clear();
+        for (auto&& [j, c_v] : c_row) {
+          c_row_acc[j] += c_v;
+        }
+
+        for (auto&& [j, c_v] : c_row) {
+          EXPECT_EQ_(c_row_ref[j], c_row_acc[j]);
+        }
+
+        EXPECT_EQ(c_row_ref.size(), c_row_acc.size());
+      }
+    }
+  }
+}
+
+TEST(CsrView, SpGEMM_AScaled) {
+  value_t alpha = 2.0f;
+  for (auto&& [m, k, nnz] : util::dims) {
+    for (auto&& n : {m, k}) {
+      auto [a_values, a_rowptr, a_colind, a_shape, a_nnz] =
+          spblas::generate_csr<value_t, index_t, offset_t>(m, k, nnz);
+      thrust::device_vector<value_t> d_a_values(a_values);
+      thrust::device_vector<offset_t> d_a_rowptr(a_rowptr);
+      thrust::device_vector<index_t> d_a_colind(a_colind);
+      spblas::csr_view<value_t, index_t, offset_t> d_a(
+          d_a_values.data().get(), d_a_rowptr.data().get(),
+          d_a_colind.data().get(), a_shape, a_nnz);
+      spblas::csr_view<value_t, index_t, offset_t> a(a_values, a_rowptr,
+                                                     a_colind, a_shape, a_nnz);
+
+      auto [b_values, b_rowptr, b_colind, b_shape, b_nnz] =
+          spblas::generate_csr<value_t, index_t, offset_t>(k, n, nnz);
+      thrust::device_vector<value_t> d_b_values(b_values);
+      thrust::device_vector<offset_t> d_b_rowptr(b_rowptr);
+      thrust::device_vector<index_t> d_b_colind(b_colind);
+      spblas::csr_view<value_t, index_t, offset_t> d_b(
+          d_b_values.data().get(), d_b_rowptr.data().get(),
+          d_b_colind.data().get(), b_shape, b_nnz);
+      spblas::csr_view<value_t, index_t, offset_t> b(b_values, b_rowptr,
+                                                     b_colind, b_shape, b_nnz);
+
+      thrust::device_vector<offset_t> d_c_rowptr(m + 1);
+
+      spblas::csr_view<value_t, index_t, offset_t> d_c(
+          nullptr, d_c_rowptr.data().get(), nullptr, {m, n}, 0);
+
+      spblas::spgemm_state_t state;
+      spblas::multiply_compute(state, spblas::scaled(alpha, d_a), d_b, d_c);
+      auto nnz = state.result_nnz();
+      thrust::device_vector<value_t> d_c_values(nnz);
+      thrust::device_vector<index_t> d_c_colind(nnz);
+      std::span<value_t> d_c_values_span(d_c_values.data().get(), nnz);
+      std::span<offset_t> d_c_rowptr_span(d_c_rowptr.data().get(), m + 1);
+      std::span<index_t> d_c_colind_span(d_c_colind.data().get(), nnz);
+      d_c.update(d_c_values_span, d_c_rowptr_span, d_c_colind_span, {m, n},
+                 nnz);
+
+      spblas::multiply_fill(state, spblas::scaled(alpha, d_a), d_b, d_c);
+
+      std::vector<value_t> c_values(nnz);
+      std::vector<offset_t> c_rowptr(m + 1);
+      std::vector<index_t> c_colind(nnz);
+      thrust::copy(d_c_values.begin(), d_c_values.end(), c_values.begin());
+      thrust::copy(d_c_rowptr.begin(), d_c_rowptr.end(), c_rowptr.begin());
+      thrust::copy(d_c_colind.begin(), d_c_colind.end(), c_colind.begin());
+      spblas::csr_view<value_t, index_t, offset_t> c(c_values, c_rowptr,
+                                                     c_colind, {m, n}, nnz);
+
+      spblas::__backend::spa_accumulator<value_t, index_t> c_row_ref(
+          spblas::__backend::shape(c)[1]);
+
+      spblas::__backend::spa_accumulator<value_t, index_t> c_row_acc(
+          spblas::__backend::shape(c)[1]);
+
+      for (auto&& [i, a_row] : spblas::__backend::rows(a)) {
+        c_row_ref.clear();
+        for (auto&& [k, a_v] : a_row) {
+          auto&& b_row = spblas::__backend::lookup_row(b, k);
+
+          for (auto&& [j, b_v] : b_row) {
+            c_row_ref[j] += alpha * a_v * b_v;
+          }
+        }
+
+        auto&& c_row = spblas::__backend::lookup_row(c, i);
+
+        // Accumulate output into `c_row_acc` so that we can allow
+        // duplicate column indices.
+        c_row_acc.clear();
+        for (auto&& [j, c_v] : c_row) {
+          c_row_acc[j] += c_v;
+        }
+
+        for (auto&& [j, c_v] : c_row) {
+          EXPECT_EQ_(c_row_ref[j], c_row_acc[j]);
+        }
+
+        EXPECT_EQ(c_row_ref.size(), c_row_acc.size());
+      }
+    }
+  }
+}
+
+TEST(CsrView, SpGEMM_BScaled) {
+  value_t alpha = 2.0f;
+  for (auto&& [m, k, nnz] : util::dims) {
+    for (auto&& n : {m, k}) {
+      auto [a_values, a_rowptr, a_colind, a_shape, a_nnz] =
+          spblas::generate_csr<value_t, index_t, offset_t>(m, k, nnz);
+      thrust::device_vector<value_t> d_a_values(a_values);
+      thrust::device_vector<offset_t> d_a_rowptr(a_rowptr);
+      thrust::device_vector<index_t> d_a_colind(a_colind);
+      spblas::csr_view<value_t, index_t, offset_t> d_a(
+          d_a_values.data().get(), d_a_rowptr.data().get(),
+          d_a_colind.data().get(), a_shape, a_nnz);
+      spblas::csr_view<value_t, index_t, offset_t> a(a_values, a_rowptr,
+                                                     a_colind, a_shape, a_nnz);
+
+      auto [b_values, b_rowptr, b_colind, b_shape, b_nnz] =
+          spblas::generate_csr<value_t, index_t, offset_t>(k, n, nnz);
+      thrust::device_vector<value_t> d_b_values(b_values);
+      thrust::device_vector<offset_t> d_b_rowptr(b_rowptr);
+      thrust::device_vector<index_t> d_b_colind(b_colind);
+      spblas::csr_view<value_t, index_t, offset_t> d_b(
+          d_b_values.data().get(), d_b_rowptr.data().get(),
+          d_b_colind.data().get(), b_shape, b_nnz);
+      spblas::csr_view<value_t, index_t, offset_t> b(b_values, b_rowptr,
+                                                     b_colind, b_shape, b_nnz);
+
+      thrust::device_vector<offset_t> d_c_rowptr(m + 1);
+
+      spblas::csr_view<value_t, index_t, offset_t> d_c(
+          nullptr, d_c_rowptr.data().get(), nullptr, {m, n}, 0);
+
+      spblas::spgemm_state_t state;
+      spblas::multiply_compute(state, d_a, spblas::scaled(alpha, d_b), d_c);
+      auto nnz = state.result_nnz();
+      thrust::device_vector<value_t> d_c_values(nnz);
+      thrust::device_vector<index_t> d_c_colind(nnz);
+      std::span<value_t> d_c_values_span(d_c_values.data().get(), nnz);
+      std::span<offset_t> d_c_rowptr_span(d_c_rowptr.data().get(), m + 1);
+      std::span<index_t> d_c_colind_span(d_c_colind.data().get(), nnz);
+      d_c.update(d_c_values_span, d_c_rowptr_span, d_c_colind_span, {m, n},
+                 nnz);
+
+      spblas::multiply_fill(state, d_a, spblas::scaled(alpha, d_b), d_c);
+
+      std::vector<value_t> c_values(nnz);
+      std::vector<offset_t> c_rowptr(m + 1);
+      std::vector<index_t> c_colind(nnz);
+      thrust::copy(d_c_values.begin(), d_c_values.end(), c_values.begin());
+      thrust::copy(d_c_rowptr.begin(), d_c_rowptr.end(), c_rowptr.begin());
+      thrust::copy(d_c_colind.begin(), d_c_colind.end(), c_colind.begin());
+      spblas::csr_view<value_t, index_t, offset_t> c(c_values, c_rowptr,
+                                                     c_colind, {m, n}, nnz);
+
+      spblas::__backend::spa_accumulator<value_t, index_t> c_row_ref(
+          spblas::__backend::shape(c)[1]);
+
+      spblas::__backend::spa_accumulator<value_t, index_t> c_row_acc(
+          spblas::__backend::shape(c)[1]);
+
+      for (auto&& [i, a_row] : spblas::__backend::rows(a)) {
+        c_row_ref.clear();
+        for (auto&& [k, a_v] : a_row) {
+          auto&& b_row = spblas::__backend::lookup_row(b, k);
+
+          for (auto&& [j, b_v] : b_row) {
+            c_row_ref[j] += alpha * a_v * b_v;
+          }
+        }
+
+        auto&& c_row = spblas::__backend::lookup_row(c, i);
+
+        // Accumulate output into `c_row_acc` so that we can allow
+        // duplicate column indices.
+        c_row_acc.clear();
+        for (auto&& [j, c_v] : c_row) {
+          c_row_acc[j] += c_v;
+        }
+
+        for (auto&& [j, c_v] : c_row) {
+          EXPECT_EQ_(c_row_ref[j], c_row_acc[j]);
+        }
+
+        EXPECT_EQ(c_row_ref.size(), c_row_acc.size());
+      }
+    }
+  }
+}

From 59cfabe2dad7a806f68c552bbac2773e96e41fac Mon Sep 17 00:00:00 2001
From: "Yu-Hsiang M. Tsai" <yhmtsai@gmail.com>
Date: Thu, 27 Mar 2025 18:38:26 +0100
Subject: [PATCH 03/14] add rocsparse spgemm with 4 args test

---
 test/gtest/CMakeLists.txt                     |   2 +-
 .../device/rocsparse/spgemm_4args_test.cpp    | 421 ++++++++++++++++++
 2 files changed, 422 insertions(+), 1 deletion(-)
 create mode 100644 test/gtest/device/rocsparse/spgemm_4args_test.cpp

diff --git a/test/gtest/CMakeLists.txt b/test/gtest/CMakeLists.txt
index 2ac50d3..c3b554b 100644
--- a/test/gtest/CMakeLists.txt
+++ b/test/gtest/CMakeLists.txt
@@ -23,7 +23,7 @@ if (NOT SPBLAS_GPU_BACKEND)
   )
 else()
   if(ENABLE_ROCSPARSE)
-    set(TEST_SOURCES device/spmv_test.cpp device/spgemm_test.cpp)
+    set(TEST_SOURCES device/spmv_test.cpp device/spgemm_test.cpp device/rocsparse/spgemm_4args_test.cpp)
   else()
     set(TEST_SOURCES device/spmv_test.cpp)
   endif()
diff --git a/test/gtest/device/rocsparse/spgemm_4args_test.cpp b/test/gtest/device/rocsparse/spgemm_4args_test.cpp
new file mode 100644
index 0000000..f1b272b
--- /dev/null
+++ b/test/gtest/device/rocsparse/spgemm_4args_test.cpp
@@ -0,0 +1,421 @@
+
+#include "../../util.hpp"
+#include <spblas/spblas.hpp>
+
+#include <gtest/gtest.h>
+#include <thrust/device_vector.h>
+
+using value_t = float;
+using index_t = spblas::index_t;
+using offset_t = spblas::offset_t;
+
+TEST(CsrView, SpGEMM_4Args) {
+  for (auto&& [m, k, nnz] : util::dims) {
+    for (auto&& n : {m, k}) {
+      auto [a_values, a_rowptr, a_colind, a_shape, a_nnz] =
+          spblas::generate_csr<value_t, index_t, offset_t>(m, k, nnz);
+      thrust::device_vector<value_t> d_a_values(a_values);
+      thrust::device_vector<offset_t> d_a_rowptr(a_rowptr);
+      thrust::device_vector<index_t> d_a_colind(a_colind);
+      spblas::csr_view<value_t, index_t, offset_t> d_a(
+          d_a_values.data().get(), d_a_rowptr.data().get(),
+          d_a_colind.data().get(), a_shape, a_nnz);
+      spblas::csr_view<value_t, index_t, offset_t> a(a_values, a_rowptr,
+                                                     a_colind, a_shape, a_nnz);
+
+      auto [b_values, b_rowptr, b_colind, b_shape, b_nnz] =
+          spblas::generate_csr<value_t, index_t, offset_t>(k, n, nnz);
+      thrust::device_vector<value_t> d_b_values(b_values);
+      thrust::device_vector<offset_t> d_b_rowptr(b_rowptr);
+      thrust::device_vector<index_t> d_b_colind(b_colind);
+      spblas::csr_view<value_t, index_t, offset_t> d_b(
+          d_b_values.data().get(), d_b_rowptr.data().get(),
+          d_b_colind.data().get(), b_shape, b_nnz);
+      spblas::csr_view<value_t, index_t, offset_t> b(b_values, b_rowptr,
+                                                     b_colind, b_shape, b_nnz);
+
+      auto [d_values, d_rowptr, d_colind, d_shape, d_nnz] =
+          spblas::generate_csr<value_t, index_t, offset_t>(m, n, nnz);
+      thrust::device_vector<value_t> d_d_values(d_values);
+      thrust::device_vector<offset_t> d_d_rowptr(d_rowptr);
+      thrust::device_vector<index_t> d_d_colind(d_colind);
+      spblas::csr_view<value_t, index_t, offset_t> d_d(
+          d_d_values.data().get(), d_d_rowptr.data().get(),
+          d_d_colind.data().get(), d_shape, d_nnz);
+      spblas::csr_view<value_t, index_t, offset_t> d(d_values, d_rowptr,
+                                                     d_colind, d_shape, d_nnz);
+
+      thrust::device_vector<offset_t> d_c_rowptr(m + 1);
+
+      spblas::csr_view<value_t, index_t, offset_t> d_c(
+          nullptr, d_c_rowptr.data().get(), nullptr, {m, n}, 0);
+
+      spblas::spgemm_state_t state;
+      spblas::multiply_compute(state, d_a, d_b, d_c, d_d);
+      auto nnz = state.result_nnz();
+      thrust::device_vector<value_t> d_c_values(nnz);
+      thrust::device_vector<index_t> d_c_colind(nnz);
+      std::span<value_t> d_c_values_span(d_c_values.data().get(), nnz);
+      std::span<offset_t> d_c_rowptr_span(d_c_rowptr.data().get(), m + 1);
+      std::span<index_t> d_c_colind_span(d_c_colind.data().get(), nnz);
+      d_c.update(d_c_values_span, d_c_rowptr_span, d_c_colind_span, {m, n},
+                 nnz);
+
+      spblas::multiply_fill(state, d_a, d_b, d_c, d_d);
+
+      std::vector<value_t> c_values(nnz);
+      std::vector<offset_t> c_rowptr(m + 1);
+      std::vector<index_t> c_colind(nnz);
+      thrust::copy(d_c_values.begin(), d_c_values.end(), c_values.begin());
+      thrust::copy(d_c_rowptr.begin(), d_c_rowptr.end(), c_rowptr.begin());
+      thrust::copy(d_c_colind.begin(), d_c_colind.end(), c_colind.begin());
+      spblas::csr_view<value_t, index_t, offset_t> c(c_values, c_rowptr,
+                                                     c_colind, {m, n}, nnz);
+
+      spblas::__backend::spa_accumulator<value_t, index_t> c_row_ref(
+          spblas::__backend::shape(c)[1]);
+
+      spblas::__backend::spa_accumulator<value_t, index_t> c_row_acc(
+          spblas::__backend::shape(c)[1]);
+
+      for (auto&& [i, a_row] : spblas::__backend::rows(a)) {
+        c_row_ref.clear();
+        for (auto&& [k, a_v] : a_row) {
+          auto&& b_row = spblas::__backend::lookup_row(b, k);
+
+          for (auto&& [j, b_v] : b_row) {
+            c_row_ref[j] += a_v * b_v;
+          }
+        }
+        auto&& d_row = spblas::__backend::lookup_row(d, i);
+        for (auto&& [k, d_v] : d_row) {
+          c_row_ref[k] += d_v;
+        }
+
+        auto&& c_row = spblas::__backend::lookup_row(c, i);
+
+        // Accumulate output into `c_row_acc` so that we can allow
+        // duplicate column indices.
+        c_row_acc.clear();
+        for (auto&& [j, c_v] : c_row) {
+          c_row_acc[j] += c_v;
+        }
+
+        for (auto&& [j, c_v] : c_row) {
+          EXPECT_EQ_(c_row_ref[j], c_row_acc[j]);
+        }
+
+        EXPECT_EQ(c_row_ref.size(), c_row_acc.size());
+      }
+    }
+  }
+}
+
+TEST(CsrView, SpGEMM_4Args_AScaled) {
+  value_t alpha = 2.0f;
+  for (auto&& [m, k, nnz] : util::dims) {
+    for (auto&& n : {m, k}) {
+      auto [a_values, a_rowptr, a_colind, a_shape, a_nnz] =
+          spblas::generate_csr<value_t, index_t, offset_t>(m, k, nnz);
+      thrust::device_vector<value_t> d_a_values(a_values);
+      thrust::device_vector<offset_t> d_a_rowptr(a_rowptr);
+      thrust::device_vector<index_t> d_a_colind(a_colind);
+      spblas::csr_view<value_t, index_t, offset_t> d_a(
+          d_a_values.data().get(), d_a_rowptr.data().get(),
+          d_a_colind.data().get(), a_shape, a_nnz);
+      spblas::csr_view<value_t, index_t, offset_t> a(a_values, a_rowptr,
+                                                     a_colind, a_shape, a_nnz);
+
+      auto [b_values, b_rowptr, b_colind, b_shape, b_nnz] =
+          spblas::generate_csr<value_t, index_t, offset_t>(k, n, nnz);
+      thrust::device_vector<value_t> d_b_values(b_values);
+      thrust::device_vector<offset_t> d_b_rowptr(b_rowptr);
+      thrust::device_vector<index_t> d_b_colind(b_colind);
+      spblas::csr_view<value_t, index_t, offset_t> d_b(
+          d_b_values.data().get(), d_b_rowptr.data().get(),
+          d_b_colind.data().get(), b_shape, b_nnz);
+      spblas::csr_view<value_t, index_t, offset_t> b(b_values, b_rowptr,
+                                                     b_colind, b_shape, b_nnz);
+
+      auto [d_values, d_rowptr, d_colind, d_shape, d_nnz] =
+          spblas::generate_csr<value_t, index_t, offset_t>(m, n, nnz);
+      thrust::device_vector<value_t> d_d_values(d_values);
+      thrust::device_vector<offset_t> d_d_rowptr(d_rowptr);
+      thrust::device_vector<index_t> d_d_colind(d_colind);
+      spblas::csr_view<value_t, index_t, offset_t> d_d(
+          d_d_values.data().get(), d_d_rowptr.data().get(),
+          d_d_colind.data().get(), d_shape, d_nnz);
+      spblas::csr_view<value_t, index_t, offset_t> d(d_values, d_rowptr,
+                                                     d_colind, d_shape, d_nnz);
+
+      thrust::device_vector<offset_t> d_c_rowptr(m + 1);
+
+      spblas::csr_view<value_t, index_t, offset_t> d_c(
+          nullptr, d_c_rowptr.data().get(), nullptr, {m, n}, 0);
+
+      spblas::spgemm_state_t state;
+      spblas::multiply_compute(state, scaled(alpha, d_a), d_b, d_c, d_d);
+      auto nnz = state.result_nnz();
+      thrust::device_vector<value_t> d_c_values(nnz);
+      thrust::device_vector<index_t> d_c_colind(nnz);
+      std::span<value_t> d_c_values_span(d_c_values.data().get(), nnz);
+      std::span<offset_t> d_c_rowptr_span(d_c_rowptr.data().get(), m + 1);
+      std::span<index_t> d_c_colind_span(d_c_colind.data().get(), nnz);
+      d_c.update(d_c_values_span, d_c_rowptr_span, d_c_colind_span, {m, n},
+                 nnz);
+
+      spblas::multiply_fill(state, scaled(alpha, d_a), d_b, d_c, d_d);
+
+      std::vector<value_t> c_values(nnz);
+      std::vector<offset_t> c_rowptr(m + 1);
+      std::vector<index_t> c_colind(nnz);
+      thrust::copy(d_c_values.begin(), d_c_values.end(), c_values.begin());
+      thrust::copy(d_c_rowptr.begin(), d_c_rowptr.end(), c_rowptr.begin());
+      thrust::copy(d_c_colind.begin(), d_c_colind.end(), c_colind.begin());
+      spblas::csr_view<value_t, index_t, offset_t> c(c_values, c_rowptr,
+                                                     c_colind, {m, n}, nnz);
+
+      spblas::__backend::spa_accumulator<value_t, index_t> c_row_ref(
+          spblas::__backend::shape(c)[1]);
+
+      spblas::__backend::spa_accumulator<value_t, index_t> c_row_acc(
+          spblas::__backend::shape(c)[1]);
+
+      for (auto&& [i, a_row] : spblas::__backend::rows(a)) {
+        c_row_ref.clear();
+        for (auto&& [k, a_v] : a_row) {
+          auto&& b_row = spblas::__backend::lookup_row(b, k);
+
+          for (auto&& [j, b_v] : b_row) {
+            c_row_ref[j] += alpha * a_v * b_v;
+          }
+        }
+        auto&& d_row = spblas::__backend::lookup_row(d, i);
+        for (auto&& [k, d_v] : d_row) {
+          c_row_ref[k] += d_v;
+        }
+
+        auto&& c_row = spblas::__backend::lookup_row(c, i);
+
+        // Accumulate output into `c_row_acc` so that we can allow
+        // duplicate column indices.
+        c_row_acc.clear();
+        for (auto&& [j, c_v] : c_row) {
+          c_row_acc[j] += c_v;
+        }
+
+        for (auto&& [j, c_v] : c_row) {
+          EXPECT_EQ_(c_row_ref[j], c_row_acc[j]);
+        }
+
+        EXPECT_EQ(c_row_ref.size(), c_row_acc.size());
+      }
+    }
+  }
+}
+
+TEST(CsrView, SpGEMM_4Args_BScaled) {
+  value_t alpha = 2.0f;
+  for (auto&& [m, k, nnz] : util::dims) {
+    for (auto&& n : {m, k}) {
+      auto [a_values, a_rowptr, a_colind, a_shape, a_nnz] =
+          spblas::generate_csr<value_t, index_t, offset_t>(m, k, nnz);
+      thrust::device_vector<value_t> d_a_values(a_values);
+      thrust::device_vector<offset_t> d_a_rowptr(a_rowptr);
+      thrust::device_vector<index_t> d_a_colind(a_colind);
+      spblas::csr_view<value_t, index_t, offset_t> d_a(
+          d_a_values.data().get(), d_a_rowptr.data().get(),
+          d_a_colind.data().get(), a_shape, a_nnz);
+      spblas::csr_view<value_t, index_t, offset_t> a(a_values, a_rowptr,
+                                                     a_colind, a_shape, a_nnz);
+
+      auto [b_values, b_rowptr, b_colind, b_shape, b_nnz] =
+          spblas::generate_csr<value_t, index_t, offset_t>(k, n, nnz);
+      thrust::device_vector<value_t> d_b_values(b_values);
+      thrust::device_vector<offset_t> d_b_rowptr(b_rowptr);
+      thrust::device_vector<index_t> d_b_colind(b_colind);
+      spblas::csr_view<value_t, index_t, offset_t> d_b(
+          d_b_values.data().get(), d_b_rowptr.data().get(),
+          d_b_colind.data().get(), b_shape, b_nnz);
+      spblas::csr_view<value_t, index_t, offset_t> b(b_values, b_rowptr,
+                                                     b_colind, b_shape, b_nnz);
+
+      auto [d_values, d_rowptr, d_colind, d_shape, d_nnz] =
+          spblas::generate_csr<value_t, index_t, offset_t>(m, n, nnz);
+      thrust::device_vector<value_t> d_d_values(d_values);
+      thrust::device_vector<offset_t> d_d_rowptr(d_rowptr);
+      thrust::device_vector<index_t> d_d_colind(d_colind);
+      spblas::csr_view<value_t, index_t, offset_t> d_d(
+          d_d_values.data().get(), d_d_rowptr.data().get(),
+          d_d_colind.data().get(), d_shape, d_nnz);
+      spblas::csr_view<value_t, index_t, offset_t> d(d_values, d_rowptr,
+                                                     d_colind, d_shape, d_nnz);
+
+      thrust::device_vector<offset_t> d_c_rowptr(m + 1);
+
+      spblas::csr_view<value_t, index_t, offset_t> d_c(
+          nullptr, d_c_rowptr.data().get(), nullptr, {m, n}, 0);
+
+      spblas::spgemm_state_t state;
+      spblas::multiply_compute(state, d_a, scaled(alpha, d_b), d_c, d_d);
+      auto nnz = state.result_nnz();
+      thrust::device_vector<value_t> d_c_values(nnz);
+      thrust::device_vector<index_t> d_c_colind(nnz);
+      std::span<value_t> d_c_values_span(d_c_values.data().get(), nnz);
+      std::span<offset_t> d_c_rowptr_span(d_c_rowptr.data().get(), m + 1);
+      std::span<index_t> d_c_colind_span(d_c_colind.data().get(), nnz);
+      d_c.update(d_c_values_span, d_c_rowptr_span, d_c_colind_span, {m, n},
+                 nnz);
+
+      spblas::multiply_fill(state, d_a, scaled(alpha, d_b), d_c, d_d);
+
+      std::vector<value_t> c_values(nnz);
+      std::vector<offset_t> c_rowptr(m + 1);
+      std::vector<index_t> c_colind(nnz);
+      thrust::copy(d_c_values.begin(), d_c_values.end(), c_values.begin());
+      thrust::copy(d_c_rowptr.begin(), d_c_rowptr.end(), c_rowptr.begin());
+      thrust::copy(d_c_colind.begin(), d_c_colind.end(), c_colind.begin());
+      spblas::csr_view<value_t, index_t, offset_t> c(c_values, c_rowptr,
+                                                     c_colind, {m, n}, nnz);
+
+      spblas::__backend::spa_accumulator<value_t, index_t> c_row_ref(
+          spblas::__backend::shape(c)[1]);
+
+      spblas::__backend::spa_accumulator<value_t, index_t> c_row_acc(
+          spblas::__backend::shape(c)[1]);
+
+      for (auto&& [i, a_row] : spblas::__backend::rows(a)) {
+        c_row_ref.clear();
+        for (auto&& [k, a_v] : a_row) {
+          auto&& b_row = spblas::__backend::lookup_row(b, k);
+
+          for (auto&& [j, b_v] : b_row) {
+            c_row_ref[j] += alpha * a_v * b_v;
+          }
+        }
+        auto&& d_row = spblas::__backend::lookup_row(d, i);
+        for (auto&& [k, d_v] : d_row) {
+          c_row_ref[k] += d_v;
+        }
+
+        auto&& c_row = spblas::__backend::lookup_row(c, i);
+
+        // Accumulate output into `c_row_acc` so that we can allow
+        // duplicate column indices.
+        c_row_acc.clear();
+        for (auto&& [j, c_v] : c_row) {
+          c_row_acc[j] += c_v;
+        }
+
+        for (auto&& [j, c_v] : c_row) {
+          EXPECT_EQ_(c_row_ref[j], c_row_acc[j]);
+        }
+
+        EXPECT_EQ(c_row_ref.size(), c_row_acc.size());
+      }
+    }
+  }
+}
+
+TEST(CsrView, SpGEMM_4Args_DScaled) {
+  value_t alpha = 2.0f;
+  for (auto&& [m, k, nnz] : util::dims) {
+    for (auto&& n : {m, k}) {
+      auto [a_values, a_rowptr, a_colind, a_shape, a_nnz] =
+          spblas::generate_csr<value_t, index_t, offset_t>(m, k, nnz);
+      thrust::device_vector<value_t> d_a_values(a_values);
+      thrust::device_vector<offset_t> d_a_rowptr(a_rowptr);
+      thrust::device_vector<index_t> d_a_colind(a_colind);
+      spblas::csr_view<value_t, index_t, offset_t> d_a(
+          d_a_values.data().get(), d_a_rowptr.data().get(),
+          d_a_colind.data().get(), a_shape, a_nnz);
+      spblas::csr_view<value_t, index_t, offset_t> a(a_values, a_rowptr,
+                                                     a_colind, a_shape, a_nnz);
+
+      auto [b_values, b_rowptr, b_colind, b_shape, b_nnz] =
+          spblas::generate_csr<value_t, index_t, offset_t>(k, n, nnz);
+      thrust::device_vector<value_t> d_b_values(b_values);
+      thrust::device_vector<offset_t> d_b_rowptr(b_rowptr);
+      thrust::device_vector<index_t> d_b_colind(b_colind);
+      spblas::csr_view<value_t, index_t, offset_t> d_b(
+          d_b_values.data().get(), d_b_rowptr.data().get(),
+          d_b_colind.data().get(), b_shape, b_nnz);
+      spblas::csr_view<value_t, index_t, offset_t> b(b_values, b_rowptr,
+                                                     b_colind, b_shape, b_nnz);
+
+      auto [d_values, d_rowptr, d_colind, d_shape, d_nnz] =
+          spblas::generate_csr<value_t, index_t, offset_t>(m, n, nnz);
+      thrust::device_vector<value_t> d_d_values(d_values);
+      thrust::device_vector<offset_t> d_d_rowptr(d_rowptr);
+      thrust::device_vector<index_t> d_d_colind(d_colind);
+      spblas::csr_view<value_t, index_t, offset_t> d_d(
+          d_d_values.data().get(), d_d_rowptr.data().get(),
+          d_d_colind.data().get(), d_shape, d_nnz);
+      spblas::csr_view<value_t, index_t, offset_t> d(d_values, d_rowptr,
+                                                     d_colind, d_shape, d_nnz);
+
+      thrust::device_vector<offset_t> d_c_rowptr(m + 1);
+
+      spblas::csr_view<value_t, index_t, offset_t> d_c(
+          nullptr, d_c_rowptr.data().get(), nullptr, {m, n}, 0);
+
+      spblas::spgemm_state_t state;
+      spblas::multiply_compute(state, d_a, d_b, d_c, scaled(alpha, d_d));
+      auto nnz = state.result_nnz();
+      thrust::device_vector<value_t> d_c_values(nnz);
+      thrust::device_vector<index_t> d_c_colind(nnz);
+      std::span<value_t> d_c_values_span(d_c_values.data().get(), nnz);
+      std::span<offset_t> d_c_rowptr_span(d_c_rowptr.data().get(), m + 1);
+      std::span<index_t> d_c_colind_span(d_c_colind.data().get(), nnz);
+      d_c.update(d_c_values_span, d_c_rowptr_span, d_c_colind_span, {m, n},
+                 nnz);
+
+      spblas::multiply_fill(state, d_a, d_b, d_c, scaled(alpha, d_d));
+
+      std::vector<value_t> c_values(nnz);
+      std::vector<offset_t> c_rowptr(m + 1);
+      std::vector<index_t> c_colind(nnz);
+      thrust::copy(d_c_values.begin(), d_c_values.end(), c_values.begin());
+      thrust::copy(d_c_rowptr.begin(), d_c_rowptr.end(), c_rowptr.begin());
+      thrust::copy(d_c_colind.begin(), d_c_colind.end(), c_colind.begin());
+      spblas::csr_view<value_t, index_t, offset_t> c(c_values, c_rowptr,
+                                                     c_colind, {m, n}, nnz);
+
+      spblas::__backend::spa_accumulator<value_t, index_t> c_row_ref(
+          spblas::__backend::shape(c)[1]);
+
+      spblas::__backend::spa_accumulator<value_t, index_t> c_row_acc(
+          spblas::__backend::shape(c)[1]);
+
+      for (auto&& [i, a_row] : spblas::__backend::rows(a)) {
+        c_row_ref.clear();
+        for (auto&& [k, a_v] : a_row) {
+          auto&& b_row = spblas::__backend::lookup_row(b, k);
+
+          for (auto&& [j, b_v] : b_row) {
+            c_row_ref[j] += a_v * b_v;
+          }
+        }
+        auto&& d_row = spblas::__backend::lookup_row(d, i);
+        for (auto&& [k, d_v] : d_row) {
+          c_row_ref[k] += alpha * d_v;
+        }
+
+        auto&& c_row = spblas::__backend::lookup_row(c, i);
+
+        // Accumulate output into `c_row_acc` so that we can allow
+        // duplicate column indices.
+        c_row_acc.clear();
+        for (auto&& [j, c_v] : c_row) {
+          c_row_acc[j] += c_v;
+        }
+
+        for (auto&& [j, c_v] : c_row) {
+          EXPECT_EQ_(c_row_ref[j], c_row_acc[j]);
+        }
+
+        EXPECT_EQ(c_row_ref.size(), c_row_acc.size());
+      }
+    }
+  }
+}

From ed7d26f81c97719119f2e47f64672fe05f4b72b8 Mon Sep 17 00:00:00 2001
From: "Yu-Hsiang M. Tsai" <yhmtsai@gmail.com>
Date: Thu, 27 Mar 2025 19:16:27 +0100
Subject: [PATCH 04/14] add rocsparse spgemm symbolic/numeric test

---
 test/gtest/CMakeLists.txt               |   2 +-
 test/gtest/device/spgemm_reuse_test.cpp | 324 ++++++++++++++++++++++++
 2 files changed, 325 insertions(+), 1 deletion(-)
 create mode 100644 test/gtest/device/spgemm_reuse_test.cpp

diff --git a/test/gtest/CMakeLists.txt b/test/gtest/CMakeLists.txt
index c3b554b..18de3e6 100644
--- a/test/gtest/CMakeLists.txt
+++ b/test/gtest/CMakeLists.txt
@@ -23,7 +23,7 @@ if (NOT SPBLAS_GPU_BACKEND)
   )
 else()
   if(ENABLE_ROCSPARSE)
-    set(TEST_SOURCES device/spmv_test.cpp device/spgemm_test.cpp device/rocsparse/spgemm_4args_test.cpp)
+    set(TEST_SOURCES device/spmv_test.cpp device/spgemm_test.cpp device/spgemm_reuse_test.cpp device/rocsparse/spgemm_4args_test.cpp)
   else()
     set(TEST_SOURCES device/spmv_test.cpp)
   endif()
diff --git a/test/gtest/device/spgemm_reuse_test.cpp b/test/gtest/device/spgemm_reuse_test.cpp
new file mode 100644
index 0000000..931c573
--- /dev/null
+++ b/test/gtest/device/spgemm_reuse_test.cpp
@@ -0,0 +1,324 @@
+
+#include "../util.hpp"
+#include <spblas/spblas.hpp>
+
+#include <gtest/gtest.h>
+#include <thrust/device_vector.h>
+
+using value_t = float;
+using index_t = spblas::index_t;
+using offset_t = spblas::offset_t;
+
+TEST(CsrView, SpGEMMReuse) {
+  for (auto&& [m, k, nnz] : util::dims) {
+    for (auto&& n : {m, k}) {
+      auto [a_values, a_rowptr, a_colind, a_shape, a_nnz] =
+          spblas::generate_csr<value_t, index_t, offset_t>(m, k, nnz);
+      thrust::device_vector<value_t> d_a_values(a_values);
+      thrust::device_vector<offset_t> d_a_rowptr(a_rowptr);
+      thrust::device_vector<index_t> d_a_colind(a_colind);
+      spblas::csr_view<value_t, index_t, offset_t> d_a(
+          d_a_values.data().get(), d_a_rowptr.data().get(),
+          d_a_colind.data().get(), a_shape, a_nnz);
+      spblas::csr_view<value_t, index_t, offset_t> a(a_values, a_rowptr,
+                                                     a_colind, a_shape, a_nnz);
+
+      auto [b_values, b_rowptr, b_colind, b_shape, b_nnz] =
+          spblas::generate_csr<value_t, index_t, offset_t>(k, n, nnz);
+      thrust::device_vector<value_t> d_b_values(b_values);
+      thrust::device_vector<offset_t> d_b_rowptr(b_rowptr);
+      thrust::device_vector<index_t> d_b_colind(b_colind);
+      spblas::csr_view<value_t, index_t, offset_t> d_b(
+          d_b_values.data().get(), d_b_rowptr.data().get(),
+          d_b_colind.data().get(), b_shape, b_nnz);
+      spblas::csr_view<value_t, index_t, offset_t> b(b_values, b_rowptr,
+                                                     b_colind, b_shape, b_nnz);
+
+      thrust::device_vector<offset_t> d_c_rowptr(m + 1);
+
+      spblas::csr_view<value_t, index_t, offset_t> d_c(
+          nullptr, d_c_rowptr.data().get(), nullptr, {m, n}, 0);
+
+      spblas::spgemm_state_t state;
+      spblas::multiply_symbolic_compute(state, d_a, d_b, d_c);
+      auto nnz = state.result_nnz();
+      thrust::device_vector<value_t> d_c_values(nnz);
+      thrust::device_vector<index_t> d_c_colind(nnz);
+      std::span<value_t> d_c_values_span(d_c_values.data().get(), nnz);
+      std::span<offset_t> d_c_rowptr_span(d_c_rowptr.data().get(), m + 1);
+      std::span<index_t> d_c_colind_span(d_c_colind.data().get(), nnz);
+      d_c.update(d_c_values_span, d_c_rowptr_span, d_c_colind_span, {m, n},
+                 nnz);
+
+      spblas::multiply_symbolic_fill(state, d_a, d_b, d_c);
+      std::mt19937 g(0);
+      for (int i = 0; i < 3; i++) {
+        // we can change the value of a and b but only need to call
+        // multiply_numeric answer here.
+        if (i != 0) {
+          // regenerate value of a and b;
+          std::uniform_real_distribution val_dist(0.0, 100.0);
+          for (auto& v : a_values) {
+            v = val_dist(g);
+          }
+          for (auto& v : b_values) {
+            v = val_dist(g);
+          }
+          thrust::copy(a_values.begin(), a_values.end(), d_a_values.begin());
+          thrust::copy(b_values.begin(), b_values.end(), d_b_values.begin());
+        }
+        spblas::multiply_numeric(state, d_a, d_b, d_c);
+        std::vector<value_t> c_values(nnz);
+        std::vector<offset_t> c_rowptr(m + 1);
+        std::vector<index_t> c_colind(nnz);
+        thrust::copy(d_c_values.begin(), d_c_values.end(), c_values.begin());
+        thrust::copy(d_c_rowptr.begin(), d_c_rowptr.end(), c_rowptr.begin());
+        thrust::copy(d_c_colind.begin(), d_c_colind.end(), c_colind.begin());
+        spblas::csr_view<value_t, index_t, offset_t> c(c_values, c_rowptr,
+                                                       c_colind, {m, n}, nnz);
+
+        spblas::__backend::spa_accumulator<value_t, index_t> c_row_ref(
+            spblas::__backend::shape(c)[1]);
+
+        spblas::__backend::spa_accumulator<value_t, index_t> c_row_acc(
+            spblas::__backend::shape(c)[1]);
+
+        for (auto&& [i, a_row] : spblas::__backend::rows(a)) {
+          c_row_ref.clear();
+          for (auto&& [k, a_v] : a_row) {
+            auto&& b_row = spblas::__backend::lookup_row(b, k);
+
+            for (auto&& [j, b_v] : b_row) {
+              c_row_ref[j] += a_v * b_v;
+            }
+          }
+
+          auto&& c_row = spblas::__backend::lookup_row(c, i);
+
+          // Accumulate output into `c_row_acc` so that we can allow
+          // duplicate column indices.
+          c_row_acc.clear();
+          for (auto&& [j, c_v] : c_row) {
+            c_row_acc[j] += c_v;
+          }
+
+          for (auto&& [j, c_v] : c_row) {
+            EXPECT_EQ_(c_row_ref[j], c_row_acc[j]);
+          }
+
+          EXPECT_EQ(c_row_ref.size(), c_row_acc.size());
+        }
+      }
+    }
+  }
+}
+
+TEST(CsrView, SpGEMMReuse_AScaled) {
+  value_t alpha = 2.0f;
+  for (auto&& [m, k, nnz] : util::dims) {
+    for (auto&& n : {m, k}) {
+      auto [a_values, a_rowptr, a_colind, a_shape, a_nnz] =
+          spblas::generate_csr<value_t, index_t, offset_t>(m, k, nnz);
+      thrust::device_vector<value_t> d_a_values(a_values);
+      thrust::device_vector<offset_t> d_a_rowptr(a_rowptr);
+      thrust::device_vector<index_t> d_a_colind(a_colind);
+      spblas::csr_view<value_t, index_t, offset_t> d_a(
+          d_a_values.data().get(), d_a_rowptr.data().get(),
+          d_a_colind.data().get(), a_shape, a_nnz);
+      spblas::csr_view<value_t, index_t, offset_t> a(a_values, a_rowptr,
+                                                     a_colind, a_shape, a_nnz);
+
+      auto [b_values, b_rowptr, b_colind, b_shape, b_nnz] =
+          spblas::generate_csr<value_t, index_t, offset_t>(k, n, nnz);
+      thrust::device_vector<value_t> d_b_values(b_values);
+      thrust::device_vector<offset_t> d_b_rowptr(b_rowptr);
+      thrust::device_vector<index_t> d_b_colind(b_colind);
+      spblas::csr_view<value_t, index_t, offset_t> d_b(
+          d_b_values.data().get(), d_b_rowptr.data().get(),
+          d_b_colind.data().get(), b_shape, b_nnz);
+      spblas::csr_view<value_t, index_t, offset_t> b(b_values, b_rowptr,
+                                                     b_colind, b_shape, b_nnz);
+
+      thrust::device_vector<offset_t> d_c_rowptr(m + 1);
+
+      spblas::csr_view<value_t, index_t, offset_t> d_c(
+          nullptr, d_c_rowptr.data().get(), nullptr, {m, n}, 0);
+
+      spblas::spgemm_state_t state;
+      spblas::multiply_symbolic_compute(state, scaled(alpha, d_a), d_b, d_c);
+      auto nnz = state.result_nnz();
+      thrust::device_vector<value_t> d_c_values(nnz);
+      thrust::device_vector<index_t> d_c_colind(nnz);
+      std::span<value_t> d_c_values_span(d_c_values.data().get(), nnz);
+      std::span<offset_t> d_c_rowptr_span(d_c_rowptr.data().get(), m + 1);
+      std::span<index_t> d_c_colind_span(d_c_colind.data().get(), nnz);
+      d_c.update(d_c_values_span, d_c_rowptr_span, d_c_colind_span, {m, n},
+                 nnz);
+
+      spblas::multiply_symbolic_fill(state, scaled(alpha, d_a), d_b, d_c);
+      std::mt19937 g(0);
+      for (int i = 0; i < 3; i++) {
+        // we can change the value of a and b but only need to call
+        // multiply_numeric answer here.
+        if (i != 0) {
+          // regenerate value of a and b;
+          std::uniform_real_distribution val_dist(0.0, 100.0);
+          for (auto& v : a_values) {
+            v = val_dist(g);
+          }
+          for (auto& v : b_values) {
+            v = val_dist(g);
+          }
+          thrust::copy(a_values.begin(), a_values.end(), d_a_values.begin());
+          thrust::copy(b_values.begin(), b_values.end(), d_b_values.begin());
+        }
+        spblas::multiply_numeric(state, scaled(alpha, d_a), d_b, d_c);
+        std::vector<value_t> c_values(nnz);
+        std::vector<offset_t> c_rowptr(m + 1);
+        std::vector<index_t> c_colind(nnz);
+        thrust::copy(d_c_values.begin(), d_c_values.end(), c_values.begin());
+        thrust::copy(d_c_rowptr.begin(), d_c_rowptr.end(), c_rowptr.begin());
+        thrust::copy(d_c_colind.begin(), d_c_colind.end(), c_colind.begin());
+        spblas::csr_view<value_t, index_t, offset_t> c(c_values, c_rowptr,
+                                                       c_colind, {m, n}, nnz);
+
+        spblas::__backend::spa_accumulator<value_t, index_t> c_row_ref(
+            spblas::__backend::shape(c)[1]);
+
+        spblas::__backend::spa_accumulator<value_t, index_t> c_row_acc(
+            spblas::__backend::shape(c)[1]);
+
+        for (auto&& [i, a_row] : spblas::__backend::rows(a)) {
+          c_row_ref.clear();
+          for (auto&& [k, a_v] : a_row) {
+            auto&& b_row = spblas::__backend::lookup_row(b, k);
+
+            for (auto&& [j, b_v] : b_row) {
+              c_row_ref[j] += alpha * a_v * b_v;
+            }
+          }
+
+          auto&& c_row = spblas::__backend::lookup_row(c, i);
+
+          // Accumulate output into `c_row_acc` so that we can allow
+          // duplicate column indices.
+          c_row_acc.clear();
+          for (auto&& [j, c_v] : c_row) {
+            c_row_acc[j] += c_v;
+          }
+
+          for (auto&& [j, c_v] : c_row) {
+            EXPECT_EQ_(c_row_ref[j], c_row_acc[j]);
+          }
+
+          EXPECT_EQ(c_row_ref.size(), c_row_acc.size());
+        }
+      }
+    }
+  }
+}
+
+TEST(CsrView, SpGEMMReuse_BScaled) {
+  value_t alpha = 2.0f;
+  for (auto&& [m, k, nnz] : util::dims) {
+    for (auto&& n : {m, k}) {
+      auto [a_values, a_rowptr, a_colind, a_shape, a_nnz] =
+          spblas::generate_csr<value_t, index_t, offset_t>(m, k, nnz);
+      thrust::device_vector<value_t> d_a_values(a_values);
+      thrust::device_vector<offset_t> d_a_rowptr(a_rowptr);
+      thrust::device_vector<index_t> d_a_colind(a_colind);
+      spblas::csr_view<value_t, index_t, offset_t> d_a(
+          d_a_values.data().get(), d_a_rowptr.data().get(),
+          d_a_colind.data().get(), a_shape, a_nnz);
+      spblas::csr_view<value_t, index_t, offset_t> a(a_values, a_rowptr,
+                                                     a_colind, a_shape, a_nnz);
+
+      auto [b_values, b_rowptr, b_colind, b_shape, b_nnz] =
+          spblas::generate_csr<value_t, index_t, offset_t>(k, n, nnz);
+      thrust::device_vector<value_t> d_b_values(b_values);
+      thrust::device_vector<offset_t> d_b_rowptr(b_rowptr);
+      thrust::device_vector<index_t> d_b_colind(b_colind);
+      spblas::csr_view<value_t, index_t, offset_t> d_b(
+          d_b_values.data().get(), d_b_rowptr.data().get(),
+          d_b_colind.data().get(), b_shape, b_nnz);
+      spblas::csr_view<value_t, index_t, offset_t> b(b_values, b_rowptr,
+                                                     b_colind, b_shape, b_nnz);
+
+      thrust::device_vector<offset_t> d_c_rowptr(m + 1);
+
+      spblas::csr_view<value_t, index_t, offset_t> d_c(
+          nullptr, d_c_rowptr.data().get(), nullptr, {m, n}, 0);
+
+      spblas::spgemm_state_t state;
+      spblas::multiply_symbolic_compute(state, d_a, scaled(alpha, d_b), d_c);
+      auto nnz = state.result_nnz();
+      thrust::device_vector<value_t> d_c_values(nnz);
+      thrust::device_vector<index_t> d_c_colind(nnz);
+      std::span<value_t> d_c_values_span(d_c_values.data().get(), nnz);
+      std::span<offset_t> d_c_rowptr_span(d_c_rowptr.data().get(), m + 1);
+      std::span<index_t> d_c_colind_span(d_c_colind.data().get(), nnz);
+      d_c.update(d_c_values_span, d_c_rowptr_span, d_c_colind_span, {m, n},
+                 nnz);
+
+      spblas::multiply_symbolic_fill(state, d_a, scaled(alpha, d_b), d_c);
+      std::mt19937 g(0);
+      for (int i = 0; i < 3; i++) {
+        // we can change the value of a and b but only need to call
+        // multiply_numeric answer here.
+        if (i != 0) {
+          // regenerate value of a and b;
+          std::uniform_real_distribution val_dist(0.0, 100.0);
+          for (auto& v : a_values) {
+            v = val_dist(g);
+          }
+          for (auto& v : b_values) {
+            v = val_dist(g);
+          }
+          thrust::copy(a_values.begin(), a_values.end(), d_a_values.begin());
+          thrust::copy(b_values.begin(), b_values.end(), d_b_values.begin());
+        }
+        spblas::multiply_numeric(state, d_a, scaled(alpha, d_b), d_c);
+        std::vector<value_t> c_values(nnz);
+        std::vector<offset_t> c_rowptr(m + 1);
+        std::vector<index_t> c_colind(nnz);
+        thrust::copy(d_c_values.begin(), d_c_values.end(), c_values.begin());
+        thrust::copy(d_c_rowptr.begin(), d_c_rowptr.end(), c_rowptr.begin());
+        thrust::copy(d_c_colind.begin(), d_c_colind.end(), c_colind.begin());
+        spblas::csr_view<value_t, index_t, offset_t> c(c_values, c_rowptr,
+                                                       c_colind, {m, n}, nnz);
+
+        spblas::__backend::spa_accumulator<value_t, index_t> c_row_ref(
+            spblas::__backend::shape(c)[1]);
+
+        spblas::__backend::spa_accumulator<value_t, index_t> c_row_acc(
+            spblas::__backend::shape(c)[1]);
+
+        for (auto&& [i, a_row] : spblas::__backend::rows(a)) {
+          c_row_ref.clear();
+          for (auto&& [k, a_v] : a_row) {
+            auto&& b_row = spblas::__backend::lookup_row(b, k);
+
+            for (auto&& [j, b_v] : b_row) {
+              c_row_ref[j] += alpha * a_v * b_v;
+            }
+          }
+
+          auto&& c_row = spblas::__backend::lookup_row(c, i);
+
+          // Accumulate output into `c_row_acc` so that we can allow
+          // duplicate column indices.
+          c_row_acc.clear();
+          for (auto&& [j, c_v] : c_row) {
+            c_row_acc[j] += c_v;
+          }
+
+          for (auto&& [j, c_v] : c_row) {
+            EXPECT_EQ_(c_row_ref[j], c_row_acc[j]);
+          }
+
+          EXPECT_EQ(c_row_ref.size(), c_row_acc.size());
+        }
+      }
+    }
+  }
+}

From e7ecada6d7f0a8b808077a231d80e079402560e9 Mon Sep 17 00:00:00 2001
From: "Yu-Hsiang M. Tsai" <yhmtsai@gmail.com>
Date: Thu, 17 Apr 2025 16:43:16 +0200
Subject: [PATCH 05/14] add the test when changing the pointer

---
 .../vendor/rocsparse/multiply_spgemm.hpp      |  17 +++
 test/gtest/device/spgemm_reuse_test.cpp       | 124 ++++++++++++++++++
 2 files changed, 141 insertions(+)

diff --git a/include/spblas/vendor/rocsparse/multiply_spgemm.hpp b/include/spblas/vendor/rocsparse/multiply_spgemm.hpp
index 7f215df..74e6f70 100644
--- a/include/spblas/vendor/rocsparse/multiply_spgemm.hpp
+++ b/include/spblas/vendor/rocsparse/multiply_spgemm.hpp
@@ -202,6 +202,7 @@ class spgemm_state_t {
   void multiply_numeric(A&& a, B&& b, C&& c, D&& d) {
     auto a_base = __detail::get_ultimate_base(a);
     auto b_base = __detail::get_ultimate_base(b);
+    auto d_base = __detail::get_ultimate_base(d);
     using matrix_type = decltype(a_base);
     using input_type = decltype(b_base);
     using output_type = std::remove_reference_t<decltype(c)>;
@@ -213,6 +214,22 @@ class spgemm_state_t {
     auto beta_optional = __detail::get_scaling_factor(d);
     value_type beta = beta_optional.value_or(1);
 
+    // Update the pointer from the matrix but they must contains the same
+    // sparsity as the previous call.
+    __rocsparse::throw_if_error(rocsparse_csr_set_pointers(
+        this->mat_a_, a_base.rowptr().data(), a_base.colind().data(),
+        a_base.values().data()));
+    __rocsparse::throw_if_error(rocsparse_csr_set_pointers(
+        this->mat_b_, b_base.rowptr().data(), b_base.colind().data(),
+        b_base.values().data()));
+    __rocsparse::throw_if_error(rocsparse_csr_set_pointers(
+        this->mat_c_, c.rowptr().data(), c.colind().data(), c.values().data()));
+    if (d_base.values().data()) {
+      // when it is still a null matrix, we can not use set pointer function
+      __rocsparse::throw_if_error(rocsparse_csr_set_pointers(
+          this->mat_d_, d_base.rowptr().data(), d_base.colind().data(),
+          d_base.values().data()));
+    }
     __rocsparse::throw_if_error(rocsparse_spgemm(
         this->handle_.get(), rocsparse_operation_none, rocsparse_operation_none,
         &alpha, this->mat_a_, this->mat_b_, &beta, this->mat_d_, this->mat_c_,
diff --git a/test/gtest/device/spgemm_reuse_test.cpp b/test/gtest/device/spgemm_reuse_test.cpp
index 931c573..836190e 100644
--- a/test/gtest/device/spgemm_reuse_test.cpp
+++ b/test/gtest/device/spgemm_reuse_test.cpp
@@ -322,3 +322,127 @@ TEST(CsrView, SpGEMMReuse_BScaled) {
     }
   }
 }
+
+TEST(CsrView, SpGEMMReuseAndChangePointer) {
+  for (auto&& [m, k, nnz] : util::dims) {
+    for (auto&& n : {m, k}) {
+      auto [a_values, a_rowptr, a_colind, a_shape, a_nnz] =
+          spblas::generate_csr<value_t, index_t, offset_t>(m, k, nnz);
+      thrust::device_vector<value_t> d_a_values(a_values);
+      thrust::device_vector<offset_t> d_a_rowptr(a_rowptr);
+      thrust::device_vector<index_t> d_a_colind(a_colind);
+      spblas::csr_view<value_t, index_t, offset_t> d_a(
+          d_a_values.data().get(), d_a_rowptr.data().get(),
+          d_a_colind.data().get(), a_shape, a_nnz);
+      spblas::csr_view<value_t, index_t, offset_t> a(a_values, a_rowptr,
+                                                     a_colind, a_shape, a_nnz);
+
+      auto [b_values, b_rowptr, b_colind, b_shape, b_nnz] =
+          spblas::generate_csr<value_t, index_t, offset_t>(k, n, nnz);
+      thrust::device_vector<value_t> d_b_values(b_values);
+      thrust::device_vector<offset_t> d_b_rowptr(b_rowptr);
+      thrust::device_vector<index_t> d_b_colind(b_colind);
+      spblas::csr_view<value_t, index_t, offset_t> d_b(
+          d_b_values.data().get(), d_b_rowptr.data().get(),
+          d_b_colind.data().get(), b_shape, b_nnz);
+      spblas::csr_view<value_t, index_t, offset_t> b(b_values, b_rowptr,
+                                                     b_colind, b_shape, b_nnz);
+
+      thrust::device_vector<offset_t> d_c_rowptr(m + 1);
+
+      spblas::csr_view<value_t, index_t, offset_t> d_c(
+          nullptr, d_c_rowptr.data().get(), nullptr, {m, n}, 0);
+
+      spblas::spgemm_state_t state;
+      spblas::multiply_symbolic_compute(state, d_a, d_b, d_c);
+      auto nnz = state.result_nnz();
+      thrust::device_vector<value_t> d_c_values(nnz);
+      thrust::device_vector<index_t> d_c_colind(nnz);
+      std::span<value_t> d_c_values_span(d_c_values.data().get(), nnz);
+      std::span<offset_t> d_c_rowptr_span(d_c_rowptr.data().get(), m + 1);
+      std::span<index_t> d_c_colind_span(d_c_colind.data().get(), nnz);
+      d_c.update(d_c_values_span, d_c_rowptr_span, d_c_colind_span, {m, n},
+                 nnz);
+
+      spblas::multiply_symbolic_fill(state, d_a, d_b, d_c);
+      std::mt19937 g(0);
+      for (int i = 0; i < 3; i++) {
+        // regenerate value of a and b;
+        std::uniform_real_distribution val_dist(0.0, 100.0);
+        for (auto& v : a_values) {
+          v = val_dist(g);
+        }
+        for (auto& v : b_values) {
+          v = val_dist(g);
+        }
+        // create different pointers than the symbolic phase, but they still
+        // hold the same sparsity
+        thrust::device_vector<value_t> d_a_values_new(a_values);
+        thrust::device_vector<index_t> d_a_colind_new(d_a_colind);
+        thrust::device_vector<index_t> d_a_rowptr_new(d_a_rowptr);
+        thrust::device_vector<value_t> d_b_values_new(b_values);
+        thrust::device_vector<index_t> d_b_colind_new(d_b_colind);
+        thrust::device_vector<index_t> d_b_rowptr_new(d_b_rowptr);
+        thrust::device_vector<value_t> d_c_values_new(d_c_values);
+        thrust::device_vector<index_t> d_c_colind_new(d_c_colind);
+        thrust::device_vector<index_t> d_c_rowptr_new(d_c_rowptr);
+        spblas::csr_view<value_t, index_t, offset_t> d_a(
+            d_a_values_new.data().get(), d_a_rowptr_new.data().get(),
+            d_a_colind_new.data().get(), a_shape, a_nnz);
+        spblas::csr_view<value_t, index_t, offset_t> d_b(
+            d_b_values_new.data().get(), d_b_rowptr_new.data().get(),
+            d_b_colind_new.data().get(), b_shape, b_nnz);
+        spblas::csr_view<value_t, index_t, offset_t> d_c(
+            d_c_values_new.data().get(), d_c_rowptr_new.data().get(),
+            d_c_colind_new.data().get(), {m, n}, nnz);
+        // call numeric on new data
+        spblas::multiply_numeric(state, d_a, d_b, d_c);
+        // move c back to host memory
+        std::vector<value_t> c_values(nnz);
+        std::vector<offset_t> c_rowptr(m + 1);
+        std::vector<index_t> c_colind(nnz);
+        thrust::copy(d_c_values_new.begin(), d_c_values_new.end(),
+                     c_values.begin());
+        thrust::copy(d_c_rowptr_new.begin(), d_c_rowptr_new.end(),
+                     c_rowptr.begin());
+        thrust::copy(d_c_colind_new.begin(), d_c_colind_new.end(),
+                     c_colind.begin());
+
+        spblas::csr_view<value_t, index_t, offset_t> c(c_values, c_rowptr,
+                                                       c_colind, {m, n}, nnz);
+
+        spblas::__backend::spa_accumulator<value_t, index_t> c_row_ref(
+            spblas::__backend::shape(c)[1]);
+
+        spblas::__backend::spa_accumulator<value_t, index_t> c_row_acc(
+            spblas::__backend::shape(c)[1]);
+
+        for (auto&& [i, a_row] : spblas::__backend::rows(a)) {
+          c_row_ref.clear();
+          for (auto&& [k, a_v] : a_row) {
+            auto&& b_row = spblas::__backend::lookup_row(b, k);
+
+            for (auto&& [j, b_v] : b_row) {
+              c_row_ref[j] += a_v * b_v;
+            }
+          }
+
+          auto&& c_row = spblas::__backend::lookup_row(c, i);
+
+          // Accumulate output into `c_row_acc` so that we can allow
+          // duplicate column indices.
+          c_row_acc.clear();
+          for (auto&& [j, c_v] : c_row) {
+            c_row_acc[j] += c_v;
+          }
+
+          for (auto&& [j, c_v] : c_row) {
+            EXPECT_EQ_(c_row_ref[j], c_row_acc[j]);
+          }
+
+          EXPECT_EQ(c_row_ref.size(), c_row_acc.size());
+        }
+      }
+    }
+  }
+}

From 4595c8d43ca74f2101066ab54a47f2bc8e0a9f9e Mon Sep 17 00:00:00 2001
From: "Yu-Hsiang M. Tsai" <yhmtsai@gmail.com>
Date: Thu, 8 May 2025 00:27:58 +0200
Subject: [PATCH 06/14] create a matrix descriptor helper

---
 .../spblas/vendor/rocsparse/descriptor.hpp    | 33 +++++++++++++++++
 include/spblas/vendor/rocsparse/multiply.hpp  | 10 ++----
 .../vendor/rocsparse/multiply_spgemm.hpp      | 35 +++----------------
 3 files changed, 40 insertions(+), 38 deletions(-)
 create mode 100644 include/spblas/vendor/rocsparse/descriptor.hpp

diff --git a/include/spblas/vendor/rocsparse/descriptor.hpp b/include/spblas/vendor/rocsparse/descriptor.hpp
new file mode 100644
index 0000000..e47e073
--- /dev/null
+++ b/include/spblas/vendor/rocsparse/descriptor.hpp
@@ -0,0 +1,33 @@
+#pragma once
+
+#include <type_traits>
+
+#include <rocsparse/rocsparse.h>
+
+#include <spblas/concepts.hpp>
+#include <spblas/views/inspectors.hpp>
+
+#include "exception.hpp"
+#include "types.hpp"
+
+namespace spblas {
+namespace __rocsparse {
+
+// create matrix descriptor from spblas csr view
+template <matrix mat>
+  requires __detail::is_csr_view_v<mat>
+rocsparse_spmat_descr create_matrix_descr(mat&& a) {
+  using matrix_type = std::remove_cvref_t<mat>;
+  rocsparse_spmat_descr descr;
+  throw_if_error(rocsparse_create_csr_descr(
+      &descr, __backend::shape(a)[0], __backend::shape(a)[1], a.values().size(),
+      a.rowptr().data(), a.colind().data(), a.values().data(),
+      to_rocsparse_indextype<typename matrix_type::offset_type>(),
+      to_rocsparse_indextype<typename matrix_type::index_type>(),
+      rocsparse_index_base_zero,
+      to_rocsparse_datatype<typename matrix_type::scalar_type>()));
+  return descr;
+}
+
+} // namespace __rocsparse
+} // namespace spblas
diff --git a/include/spblas/vendor/rocsparse/multiply.hpp b/include/spblas/vendor/rocsparse/multiply.hpp
index 7d88c5c..212a5d1 100644
--- a/include/spblas/vendor/rocsparse/multiply.hpp
+++ b/include/spblas/vendor/rocsparse/multiply.hpp
@@ -10,6 +10,7 @@
 #include <spblas/detail/ranges.hpp>
 #include <spblas/detail/view_inspectors.hpp>
 
+#include "descriptor.hpp"
 #include "exception.hpp"
 #include "hip_allocator.hpp"
 #include "types.hpp"
@@ -59,14 +60,7 @@ class spmv_state_t {
     tensor_scalar_t<A> alpha = alpha_optional.value_or(1);
     auto handle = this->handle_.get();
 
-    rocsparse_spmat_descr mat;
-    __rocsparse::throw_if_error(rocsparse_create_csr_descr(
-        &mat, __backend::shape(a_base)[0], __backend::shape(a_base)[1],
-        a_base.values().size(), a_base.rowptr().data(), a_base.colind().data(),
-        a_base.values().data(),
-        to_rocsparse_indextype<typename matrix_type::offset_type>(),
-        to_rocsparse_indextype<typename matrix_type::index_type>(),
-        rocsparse_index_base_zero, to_rocsparse_datatype<value_type>()));
+    rocsparse_spmat_descr mat = __rocsparse::create_matrix_descr(a_base);
     rocsparse_dnvec_descr vecb;
     rocsparse_dnvec_descr vecc;
     __rocsparse::throw_if_error(rocsparse_create_dnvec_descr(
diff --git a/include/spblas/vendor/rocsparse/multiply_spgemm.hpp b/include/spblas/vendor/rocsparse/multiply_spgemm.hpp
index 74e6f70..5eae63e 100644
--- a/include/spblas/vendor/rocsparse/multiply_spgemm.hpp
+++ b/include/spblas/vendor/rocsparse/multiply_spgemm.hpp
@@ -10,6 +10,7 @@
 #include <spblas/detail/ranges.hpp>
 #include <spblas/detail/view_inspectors.hpp>
 
+#include "descriptor.hpp"
 #include "exception.hpp"
 #include "hip_allocator.hpp"
 #include "types.hpp"
@@ -85,36 +86,10 @@ class spgemm_state_t {
     value_type beta = beta_optional.value_or(1);
     auto handle = this->handle_.get();
     // Create sparse matrix A in CSR format
-    __rocsparse::throw_if_error(rocsparse_create_csr_descr(
-        &this->mat_a_, __backend::shape(a_base)[0], __backend::shape(a_base)[1],
-        a_base.values().size(), a_base.rowptr().data(), a_base.colind().data(),
-        a_base.values().data(),
-        to_rocsparse_indextype<typename matrix_type::offset_type>(),
-        to_rocsparse_indextype<typename matrix_type::index_type>(),
-        rocsparse_index_base_zero, to_rocsparse_datatype<value_type>()));
-    __rocsparse::throw_if_error(rocsparse_create_csr_descr(
-        &this->mat_b_, __backend::shape(b_base)[0], __backend::shape(b_base)[1],
-        b_base.values().size(), b_base.rowptr().data(), b_base.colind().data(),
-        b_base.values().data(),
-        to_rocsparse_indextype<typename input_type::offset_type>(),
-        to_rocsparse_indextype<typename input_type::index_type>(),
-        rocsparse_index_base_zero,
-        to_rocsparse_datatype<typename input_type::scalar_type>()));
-    __rocsparse::throw_if_error(rocsparse_create_csr_descr(
-        &this->mat_c_, __backend::shape(a_base)[0], __backend::shape(b_base)[1],
-        0, c.rowptr().data(), NULL, NULL,
-        to_rocsparse_indextype<typename output_type::offset_type>(),
-        to_rocsparse_indextype<typename output_type::index_type>(),
-        rocsparse_index_base_zero,
-        to_rocsparse_datatype<typename output_type::scalar_type>()));
-    __rocsparse::throw_if_error(rocsparse_create_csr_descr(
-        &this->mat_d_, __backend::shape(d_base)[0], __backend::shape(d_base)[1],
-        d_base.values().size(), d_base.rowptr().data(), d_base.colind().data(),
-        d_base.values().data(),
-        to_rocsparse_indextype<typename output_type::offset_type>(),
-        to_rocsparse_indextype<typename output_type::index_type>(),
-        rocsparse_index_base_zero,
-        to_rocsparse_datatype<typename output_type::scalar_type>()));
+    this->mat_a_ = __rocsparse::create_matrix_descr(a_base);
+    this->mat_b_ = __rocsparse::create_matrix_descr(b_base);
+    this->mat_c_ = __rocsparse::create_matrix_descr(c);
+    this->mat_d_ = __rocsparse::create_matrix_descr(d_base);
     //--------------------------------------------------------------------------
     // SpGEMM Computation
     // ask buffer_size bytes for external memory

From eeb3a674fb995d3e7a41b919e4c7197c78ddc703 Mon Sep 17 00:00:00 2001
From: "Yu-Hsiang M. Tsai" <yhmtsai@gmail.com>
Date: Thu, 8 May 2025 00:36:35 +0200
Subject: [PATCH 07/14] add vector descriptor helper

---
 include/spblas/vendor/rocsparse/descriptor.hpp | 12 ++++++++++++
 include/spblas/vendor/rocsparse/multiply.hpp   | 10 ++--------
 2 files changed, 14 insertions(+), 8 deletions(-)

diff --git a/include/spblas/vendor/rocsparse/descriptor.hpp b/include/spblas/vendor/rocsparse/descriptor.hpp
index e47e073..cf45934 100644
--- a/include/spblas/vendor/rocsparse/descriptor.hpp
+++ b/include/spblas/vendor/rocsparse/descriptor.hpp
@@ -29,5 +29,17 @@ rocsparse_spmat_descr create_matrix_descr(mat&& a) {
   return descr;
 }
 
+// create dense vector from mdspan
+template <vector vec>
+  requires __ranges::contiguous_range<vec>
+rocsparse_dnvec_descr create_vector_descr(vec&& v) {
+  using vector_type = std::remove_cvref_t<vec>;
+  rocsparse_dnvec_descr descr;
+  throw_if_error(rocsparse_create_dnvec_descr(
+      &descr, v.size(), v.data(),
+      to_rocsparse_datatype<typename vector_type::value_type>()));
+  return descr;
+}
+
 } // namespace __rocsparse
 } // namespace spblas
diff --git a/include/spblas/vendor/rocsparse/multiply.hpp b/include/spblas/vendor/rocsparse/multiply.hpp
index 212a5d1..d05ff98 100644
--- a/include/spblas/vendor/rocsparse/multiply.hpp
+++ b/include/spblas/vendor/rocsparse/multiply.hpp
@@ -61,14 +61,8 @@ class spmv_state_t {
     auto handle = this->handle_.get();
 
     rocsparse_spmat_descr mat = __rocsparse::create_matrix_descr(a_base);
-    rocsparse_dnvec_descr vecb;
-    rocsparse_dnvec_descr vecc;
-    __rocsparse::throw_if_error(rocsparse_create_dnvec_descr(
-        &vecb, b_base.size(), b_base.data(),
-        to_rocsparse_datatype<typename input_type::value_type>()));
-    __rocsparse::throw_if_error(rocsparse_create_dnvec_descr(
-        &vecc, c.size(), c.data(),
-        to_rocsparse_datatype<typename output_type::value_type>()));
+    rocsparse_dnvec_descr vecb = __rocsparse::create_vector_descr(b_base);
+    rocsparse_dnvec_descr vecc = __rocsparse::create_vector_descr(c);
     value_type alpha_val = alpha;
     value_type beta = 0.0;
     long unsigned int buffer_size = 0;

From 59c0b7d0aac5e44ece6edd299a22f36b0a138605 Mon Sep 17 00:00:00 2001
From: "Yu-Hsiang M. Tsai" <yhmtsai@gmail.com>
Date: Thu, 22 May 2025 16:24:31 +0200
Subject: [PATCH 08/14] review update

---
 include/spblas/vendor/rocsparse/multiply_spgemm.hpp | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/include/spblas/vendor/rocsparse/multiply_spgemm.hpp b/include/spblas/vendor/rocsparse/multiply_spgemm.hpp
index 5eae63e..c6ede83 100644
--- a/include/spblas/vendor/rocsparse/multiply_spgemm.hpp
+++ b/include/spblas/vendor/rocsparse/multiply_spgemm.hpp
@@ -1,5 +1,6 @@
 #pragma once
 
+#include <cstdint>
 #include <functional>
 #include <memory>
 #include <type_traits>
@@ -85,19 +86,19 @@ class spgemm_state_t {
     auto beta_optional = __detail::get_scaling_factor(d);
     value_type beta = beta_optional.value_or(1);
     auto handle = this->handle_.get();
-    // Create sparse matrix A in CSR format
+    // Create sparse matrix in CSR format
     this->mat_a_ = __rocsparse::create_matrix_descr(a_base);
     this->mat_b_ = __rocsparse::create_matrix_descr(b_base);
     this->mat_c_ = __rocsparse::create_matrix_descr(c);
     this->mat_d_ = __rocsparse::create_matrix_descr(d_base);
-    //--------------------------------------------------------------------------
-    // SpGEMM Computation
     // ask buffer_size bytes for external memory
     __rocsparse::throw_if_error(rocsparse_spgemm(
         handle, rocsparse_operation_none, rocsparse_operation_none, &alpha,
         this->mat_a_, this->mat_b_, &beta, this->mat_d_, this->mat_c_,
         to_rocsparse_datatype<value_type>(), rocsparse_spgemm_alg_default,
         rocsparse_spgemm_stage_buffer_size, &buffer_size, nullptr));
+    // allocate the new buffer if it requires more than what the buffer
+    // currently has.
     if (buffer_size > this->buffer_size_) {
       this->alloc_.deallocate(workspace_, this->buffer_size_);
       this->buffer_size_ = buffer_size;
@@ -113,6 +114,7 @@ class spgemm_state_t {
     int64_t c_num_cols;
     __rocsparse::throw_if_error(rocsparse_spmat_get_size(
         this->mat_c_, &c_num_rows, &c_num_cols, &this->result_nnz_));
+    // form a shape
     this->result_shape_ = index<index_t>(c_num_rows, c_num_cols);
   }
 
@@ -218,10 +220,10 @@ class spgemm_state_t {
                       std::function<void(rocsparse_handle)>>;
   handle_manager handle_;
   rocsparse::hip_allocator<char> alloc_;
-  long unsigned int buffer_size_;
+  std::uint64_t buffer_size_;
   char* workspace_;
   index<index_t> result_shape_;
-  index_t result_nnz_;
+  std::int64_t result_nnz_;
   rocsparse_spmat_descr mat_a_;
   rocsparse_spmat_descr mat_b_;
   rocsparse_spmat_descr mat_c_;

From 5f9c0905727f4efcad83e1153954e0ea0399ef3b Mon Sep 17 00:00:00 2001
From: "Yu-Hsiang M. Tsai" <yhmtsai@gmail.com>
Date: Thu, 7 Aug 2025 16:13:58 +0200
Subject: [PATCH 09/14] fix and adapt the changes from main

---
 .../spblas/vendor/rocsparse/descriptor.hpp    | 45 -------------------
 .../vendor/rocsparse/multiply_spgemm.hpp      | 19 ++++----
 .../device/rocsparse/spgemm_4args_test.cpp    |  8 ++--
 test/gtest/device/spgemm_reuse_test.cpp       |  8 ++--
 test/gtest/device/spgemm_test.cpp             |  6 +--
 5 files changed, 20 insertions(+), 66 deletions(-)
 delete mode 100644 include/spblas/vendor/rocsparse/descriptor.hpp

diff --git a/include/spblas/vendor/rocsparse/descriptor.hpp b/include/spblas/vendor/rocsparse/descriptor.hpp
deleted file mode 100644
index cf45934..0000000
--- a/include/spblas/vendor/rocsparse/descriptor.hpp
+++ /dev/null
@@ -1,45 +0,0 @@
-#pragma once
-
-#include <type_traits>
-
-#include <rocsparse/rocsparse.h>
-
-#include <spblas/concepts.hpp>
-#include <spblas/views/inspectors.hpp>
-
-#include "exception.hpp"
-#include "types.hpp"
-
-namespace spblas {
-namespace __rocsparse {
-
-// create matrix descriptor from spblas csr view
-template <matrix mat>
-  requires __detail::is_csr_view_v<mat>
-rocsparse_spmat_descr create_matrix_descr(mat&& a) {
-  using matrix_type = std::remove_cvref_t<mat>;
-  rocsparse_spmat_descr descr;
-  throw_if_error(rocsparse_create_csr_descr(
-      &descr, __backend::shape(a)[0], __backend::shape(a)[1], a.values().size(),
-      a.rowptr().data(), a.colind().data(), a.values().data(),
-      to_rocsparse_indextype<typename matrix_type::offset_type>(),
-      to_rocsparse_indextype<typename matrix_type::index_type>(),
-      rocsparse_index_base_zero,
-      to_rocsparse_datatype<typename matrix_type::scalar_type>()));
-  return descr;
-}
-
-// create dense vector from mdspan
-template <vector vec>
-  requires __ranges::contiguous_range<vec>
-rocsparse_dnvec_descr create_vector_descr(vec&& v) {
-  using vector_type = std::remove_cvref_t<vec>;
-  rocsparse_dnvec_descr descr;
-  throw_if_error(rocsparse_create_dnvec_descr(
-      &descr, v.size(), v.data(),
-      to_rocsparse_datatype<typename vector_type::value_type>()));
-  return descr;
-}
-
-} // namespace __rocsparse
-} // namespace spblas
diff --git a/include/spblas/vendor/rocsparse/multiply_spgemm.hpp b/include/spblas/vendor/rocsparse/multiply_spgemm.hpp
index c6ede83..0b58f37 100644
--- a/include/spblas/vendor/rocsparse/multiply_spgemm.hpp
+++ b/include/spblas/vendor/rocsparse/multiply_spgemm.hpp
@@ -11,7 +11,6 @@
 #include <spblas/detail/ranges.hpp>
 #include <spblas/detail/view_inspectors.hpp>
 
-#include "descriptor.hpp"
 #include "exception.hpp"
 #include "hip_allocator.hpp"
 #include "types.hpp"
@@ -87,15 +86,15 @@ class spgemm_state_t {
     value_type beta = beta_optional.value_or(1);
     auto handle = this->handle_.get();
     // Create sparse matrix in CSR format
-    this->mat_a_ = __rocsparse::create_matrix_descr(a_base);
-    this->mat_b_ = __rocsparse::create_matrix_descr(b_base);
-    this->mat_c_ = __rocsparse::create_matrix_descr(c);
-    this->mat_d_ = __rocsparse::create_matrix_descr(d_base);
+    this->mat_a_ = __rocsparse::create_rocsparse_handle(a_base);
+    this->mat_b_ = __rocsparse::create_rocsparse_handle(b_base);
+    this->mat_c_ = __rocsparse::create_rocsparse_handle(c);
+    this->mat_d_ = __rocsparse::create_rocsparse_handle(d_base);
     // ask buffer_size bytes for external memory
     __rocsparse::throw_if_error(rocsparse_spgemm(
         handle, rocsparse_operation_none, rocsparse_operation_none, &alpha,
         this->mat_a_, this->mat_b_, &beta, this->mat_d_, this->mat_c_,
-        to_rocsparse_datatype<value_type>(), rocsparse_spgemm_alg_default,
+        detail::rocsparse_data_type_v<value_type>, rocsparse_spgemm_alg_default,
         rocsparse_spgemm_stage_buffer_size, &buffer_size, nullptr));
     // allocate the new buffer if it requires more than what the buffer
     // currently has.
@@ -107,7 +106,7 @@ class spgemm_state_t {
     __rocsparse::throw_if_error(rocsparse_spgemm(
         handle, rocsparse_operation_none, rocsparse_operation_none, &alpha,
         this->mat_a_, this->mat_b_, &beta, this->mat_d_, this->mat_c_,
-        to_rocsparse_datatype<value_type>(), rocsparse_spgemm_alg_default,
+        detail::rocsparse_data_type_v<value_type>, rocsparse_spgemm_alg_default,
         rocsparse_spgemm_stage_nnz, &this->buffer_size_, this->workspace_));
     // get matrix C non-zero entries and size
     int64_t c_num_rows;
@@ -141,7 +140,7 @@ class spgemm_state_t {
     __rocsparse::throw_if_error(rocsparse_spgemm(
         handle_.get(), rocsparse_operation_none, rocsparse_operation_none,
         &alpha, this->mat_a_, this->mat_b_, &beta, this->mat_d_, this->mat_c_,
-        to_rocsparse_datatype<value_type>(), rocsparse_spgemm_alg_default,
+        detail::rocsparse_data_type_v<value_type>, rocsparse_spgemm_alg_default,
         rocsparse_spgemm_stage_compute, &this->buffer_size_, workspace_));
   }
 
@@ -168,7 +167,7 @@ class spgemm_state_t {
     __rocsparse::throw_if_error(rocsparse_spgemm(
         this->handle_.get(), rocsparse_operation_none, rocsparse_operation_none,
         &alpha, this->mat_a_, this->mat_b_, &beta, this->mat_d_, this->mat_c_,
-        to_rocsparse_datatype<value_type>(), rocsparse_spgemm_alg_default,
+        detail::rocsparse_data_type_v<value_type>, rocsparse_spgemm_alg_default,
         rocsparse_spgemm_stage_symbolic, &this->buffer_size_,
         this->workspace_));
   }
@@ -210,7 +209,7 @@ class spgemm_state_t {
     __rocsparse::throw_if_error(rocsparse_spgemm(
         this->handle_.get(), rocsparse_operation_none, rocsparse_operation_none,
         &alpha, this->mat_a_, this->mat_b_, &beta, this->mat_d_, this->mat_c_,
-        to_rocsparse_datatype<value_type>(), rocsparse_spgemm_alg_default,
+        detail::rocsparse_data_type_v<value_type>, rocsparse_spgemm_alg_default,
         rocsparse_spgemm_stage_numeric, &this->buffer_size_, this->workspace_));
   }
 
diff --git a/test/gtest/device/rocsparse/spgemm_4args_test.cpp b/test/gtest/device/rocsparse/spgemm_4args_test.cpp
index f1b272b..1a8e75a 100644
--- a/test/gtest/device/rocsparse/spgemm_4args_test.cpp
+++ b/test/gtest/device/rocsparse/spgemm_4args_test.cpp
@@ -9,7 +9,7 @@ using value_t = float;
 using index_t = spblas::index_t;
 using offset_t = spblas::offset_t;
 
-TEST(CsrView, SpGEMM_4Args) {
+TEST(thrust_CsrView, SpGEMM_4Args) {
   for (auto&& [m, k, nnz] : util::dims) {
     for (auto&& n : {m, k}) {
       auto [a_values, a_rowptr, a_colind, a_shape, a_nnz] =
@@ -111,7 +111,7 @@ TEST(CsrView, SpGEMM_4Args) {
   }
 }
 
-TEST(CsrView, SpGEMM_4Args_AScaled) {
+TEST(thrust_CsrView, SpGEMM_4Args_AScaled) {
   value_t alpha = 2.0f;
   for (auto&& [m, k, nnz] : util::dims) {
     for (auto&& n : {m, k}) {
@@ -214,7 +214,7 @@ TEST(CsrView, SpGEMM_4Args_AScaled) {
   }
 }
 
-TEST(CsrView, SpGEMM_4Args_BScaled) {
+TEST(thrust_CsrView, SpGEMM_4Args_BScaled) {
   value_t alpha = 2.0f;
   for (auto&& [m, k, nnz] : util::dims) {
     for (auto&& n : {m, k}) {
@@ -317,7 +317,7 @@ TEST(CsrView, SpGEMM_4Args_BScaled) {
   }
 }
 
-TEST(CsrView, SpGEMM_4Args_DScaled) {
+TEST(thrust_CsrView, SpGEMM_4Args_DScaled) {
   value_t alpha = 2.0f;
   for (auto&& [m, k, nnz] : util::dims) {
     for (auto&& n : {m, k}) {
diff --git a/test/gtest/device/spgemm_reuse_test.cpp b/test/gtest/device/spgemm_reuse_test.cpp
index 836190e..b798d6e 100644
--- a/test/gtest/device/spgemm_reuse_test.cpp
+++ b/test/gtest/device/spgemm_reuse_test.cpp
@@ -9,7 +9,7 @@ using value_t = float;
 using index_t = spblas::index_t;
 using offset_t = spblas::offset_t;
 
-TEST(CsrView, SpGEMMReuse) {
+TEST(thrust_CsrView, SpGEMMReuse) {
   for (auto&& [m, k, nnz] : util::dims) {
     for (auto&& n : {m, k}) {
       auto [a_values, a_rowptr, a_colind, a_shape, a_nnz] =
@@ -113,7 +113,7 @@ TEST(CsrView, SpGEMMReuse) {
   }
 }
 
-TEST(CsrView, SpGEMMReuse_AScaled) {
+TEST(thrust_CsrView, SpGEMMReuse_AScaled) {
   value_t alpha = 2.0f;
   for (auto&& [m, k, nnz] : util::dims) {
     for (auto&& n : {m, k}) {
@@ -218,7 +218,7 @@ TEST(CsrView, SpGEMMReuse_AScaled) {
   }
 }
 
-TEST(CsrView, SpGEMMReuse_BScaled) {
+TEST(thrust_CsrView, SpGEMMReuse_BScaled) {
   value_t alpha = 2.0f;
   for (auto&& [m, k, nnz] : util::dims) {
     for (auto&& n : {m, k}) {
@@ -323,7 +323,7 @@ TEST(CsrView, SpGEMMReuse_BScaled) {
   }
 }
 
-TEST(CsrView, SpGEMMReuseAndChangePointer) {
+TEST(thrust_CsrView, SpGEMMReuseAndChangePointer) {
   for (auto&& [m, k, nnz] : util::dims) {
     for (auto&& n : {m, k}) {
       auto [a_values, a_rowptr, a_colind, a_shape, a_nnz] =
diff --git a/test/gtest/device/spgemm_test.cpp b/test/gtest/device/spgemm_test.cpp
index 4662ffd..98e1aed 100644
--- a/test/gtest/device/spgemm_test.cpp
+++ b/test/gtest/device/spgemm_test.cpp
@@ -9,7 +9,7 @@ using value_t = float;
 using index_t = spblas::index_t;
 using offset_t = spblas::offset_t;
 
-TEST(CsrView, SpGEMM) {
+TEST(thrust_CsrView, SpGEMM) {
   for (auto&& [m, k, nnz] : util::dims) {
     for (auto&& n : {m, k}) {
       auto [a_values, a_rowptr, a_colind, a_shape, a_nnz] =
@@ -96,7 +96,7 @@ TEST(CsrView, SpGEMM) {
   }
 }
 
-TEST(CsrView, SpGEMM_AScaled) {
+TEST(thrust_CsrView, SpGEMM_AScaled) {
   value_t alpha = 2.0f;
   for (auto&& [m, k, nnz] : util::dims) {
     for (auto&& n : {m, k}) {
@@ -184,7 +184,7 @@ TEST(CsrView, SpGEMM_AScaled) {
   }
 }
 
-TEST(CsrView, SpGEMM_BScaled) {
+TEST(thrust_CsrView, SpGEMM_BScaled) {
   value_t alpha = 2.0f;
   for (auto&& [m, k, nnz] : util::dims) {
     for (auto&& n : {m, k}) {

From e36a51616e67c1fcb52f26d2987bc5ac50e80510 Mon Sep 17 00:00:00 2001
From: "Yu-Hsiang M. Tsai" <yhmtsai@gmail.com>
Date: Wed, 20 Aug 2025 16:08:22 +0200
Subject: [PATCH 10/14] fix meaningless cmake function

Co-authored-by: Benjamin Brock <brock@cs.berkeley.edu>
---
 test/gtest/CMakeLists.txt | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/test/gtest/CMakeLists.txt b/test/gtest/CMakeLists.txt
index 84248cf..cd96de1 100644
--- a/test/gtest/CMakeLists.txt
+++ b/test/gtest/CMakeLists.txt
@@ -13,16 +13,12 @@ if (SPBLAS_CPU_BACKEND)
        transpose_test.cpp
        triangular_solve_test.cpp)
 endif()
-function(add_rocsparse_lang file_list)
-  if (ENABLE_ROCSPARSE)
-    set_source_files_properties(${${file_list}} PROPERTIES LANGUAGE HIP)
-  endif()
-endfunction()
+
 # GPU tests
 if (SPBLAS_GPU_BACKEND)
   if (ENABLE_ROCSPARSE)
     set(GPUTEST_SOURCES device/spmv_test.cpp device/spgemm_test.cpp device/spgemm_reuse_test.cpp device/rocsparse/spgemm_4args_test.cpp)
-    add_rocsparse_lang(GPUTEST_SOURCES)
+    set_source_files_properties(${GPUTEST_SOURCES} PROPERTIES LANGUAGE HIP)
   else ()
     set(GPUTEST_SOURCES device/spmv_test.cpp)
   endif ()

From 6bfe91c1b7ac50b7ee31d8fc4e246ebc07f370e3 Mon Sep 17 00:00:00 2001
From: "Yu-Hsiang M. Tsai" <yhmtsai@gmail.com>
Date: Thu, 24 Apr 2025 19:10:35 +0200
Subject: [PATCH 11/14] add cusparse spgemm

---
 CMakeLists.txt                                |   1 +
 include/spblas/vendor/cusparse/cusparse.hpp   |   1 +
 include/spblas/vendor/cusparse/exception.hpp  |   4 +-
 .../vendor/cusparse/multiply_spgemm.hpp       | 211 ++++++++++++++++++
 test/gtest/CMakeLists.txt                     |   2 +-
 test/gtest/device/spgemm_test.cpp             |  40 ++--
 6 files changed, 236 insertions(+), 23 deletions(-)
 create mode 100644 include/spblas/vendor/cusparse/multiply_spgemm.hpp

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 7104450..a39415a 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -90,6 +90,7 @@ endif()
 if (ENABLE_CUSPARSE)
   set(SPBLAS_GPU_BACKEND ON)
   find_package(CUDAToolkit REQUIRED)
+  enable_language(CUDA)
   target_link_libraries(spblas INTERFACE CUDA::cudart CUDA::cusparse CUDA::cublas)
   set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DSPBLAS_ENABLE_CUSPARSE")
 endif()
diff --git a/include/spblas/vendor/cusparse/cusparse.hpp b/include/spblas/vendor/cusparse/cusparse.hpp
index 7caa698..014b2ba 100644
--- a/include/spblas/vendor/cusparse/cusparse.hpp
+++ b/include/spblas/vendor/cusparse/cusparse.hpp
@@ -1,3 +1,4 @@
 #pragma once
 
 #include "multiply.hpp"
+#include "multiply_spgemm.hpp"
diff --git a/include/spblas/vendor/cusparse/exception.hpp b/include/spblas/vendor/cusparse/exception.hpp
index d90ac30..4be80eb 100644
--- a/include/spblas/vendor/cusparse/exception.hpp
+++ b/include/spblas/vendor/cusparse/exception.hpp
@@ -10,7 +10,7 @@ namespace spblas {
 namespace __cusparse {
 
 // Throw an exception if the cudaError_t is not cudaSuccess.
-void throw_if_error(cudaError_t error_code, std::string prefix = "") {
+inline void throw_if_error(cudaError_t error_code, std::string prefix = "") {
   if (error_code == cudaSuccess) {
     return;
   }
@@ -21,7 +21,7 @@ void throw_if_error(cudaError_t error_code, std::string prefix = "") {
 }
 
 // Throw an exception if the cusparseStatus_t is not CUSPARSE_STATUS_SUCCESS.
-void throw_if_error(cusparseStatus_t error_code) {
+inline void throw_if_error(cusparseStatus_t error_code) {
   if (error_code == CUSPARSE_STATUS_SUCCESS) {
     return;
   } else if (error_code == CUSPARSE_STATUS_NOT_INITIALIZED) {
diff --git a/include/spblas/vendor/cusparse/multiply_spgemm.hpp b/include/spblas/vendor/cusparse/multiply_spgemm.hpp
new file mode 100644
index 0000000..8b2e181
--- /dev/null
+++ b/include/spblas/vendor/cusparse/multiply_spgemm.hpp
@@ -0,0 +1,211 @@
+#pragma once
+
+#include <functional>
+#include <memory>
+#include <type_traits>
+
+#include <cuda_runtime.h>
+#include <cusparse.h>
+
+#include <spblas/detail/ranges.hpp>
+#include <spblas/detail/view_inspectors.hpp>
+
+#include "cuda_allocator.hpp"
+#include "exception.hpp"
+#include "types.hpp"
+
+namespace spblas {
+
+class spgemm_state_t {
+public:
+  spgemm_state_t() : spgemm_state_t(cusparse::cuda_allocator<char>{}) {}
+
+  spgemm_state_t(cusparse::cuda_allocator<char> alloc)
+      : alloc_(alloc), buffer_size_1_(0), buffer_size_2_(0),
+        workspace_1_(nullptr), workspace_2_(nullptr), result_nnz_(0),
+        result_shape_(0, 0) {
+    cusparseHandle_t handle;
+    __cusparse::throw_if_error(cusparseCreate(&handle));
+    if (auto stream = alloc.stream()) {
+      cusparseSetStream(handle, stream);
+    }
+    handle_ = handle_manager(handle, [](cusparseHandle_t handle) {
+      __cusparse::throw_if_error(cusparseDestroy(handle));
+    });
+    __cusparse::throw_if_error(cusparseSpGEMM_createDescr(&descr_));
+  }
+
+  spgemm_state_t(cusparse::cuda_allocator<char> alloc, cusparseHandle_t handle)
+      : alloc_(alloc), buffer_size_1_(0), buffer_size_2_(0),
+        workspace_1_(nullptr), workspace_2_(nullptr), result_nnz_(0),
+        result_shape_(0, 0) {
+    handle_ = handle_manager(handle, [](cusparseHandle_t handle) {
+      // it is provided by user, we do not delete it at all.
+    });
+    __cusparse::throw_if_error(cusparseSpGEMM_createDescr(&descr_));
+  }
+
+  ~spgemm_state_t() {
+    alloc_.deallocate(workspace_1_, buffer_size_1_);
+    alloc_.deallocate(workspace_2_, buffer_size_2_);
+    __cusparse::throw_if_error(cusparseDestroySpMat(mat_a_));
+    __cusparse::throw_if_error(cusparseDestroySpMat(mat_b_));
+    __cusparse::throw_if_error(cusparseDestroySpMat(mat_c_));
+    __cusparse::throw_if_error(cusparseSpGEMM_destroyDescr(descr_));
+  }
+
+  auto result_shape() {
+    return this->result_shape_;
+  }
+
+  auto result_nnz() {
+    return this->result_nnz_;
+  }
+
+  template <matrix A, matrix B, matrix C>
+    requires __detail::has_csr_base<A> && __detail::has_csr_base<B> &&
+             __detail::is_csr_view_v<C>
+  void multiply_compute(A&& a, B&& b, C&& c) {
+    auto a_base = __detail::get_ultimate_base(a);
+    auto b_base = __detail::get_ultimate_base(b);
+    using matrix_type = decltype(a_base);
+    using input_type = decltype(b_base);
+    using output_type = std::remove_reference_t<decltype(c)>;
+    using value_type = typename matrix_type::scalar_type;
+    size_t buffer_size = 0;
+
+    auto alpha_optional = __detail::get_scaling_factor(a, b);
+    value_type alpha = alpha_optional.value_or(1);
+    value_type beta = 1;
+    auto handle = this->handle_.get();
+    // Create sparse matrix A in CSR format
+    __cusparse::throw_if_error(cusparseCreateCsr(
+        &mat_a_, __backend::shape(a_base)[0], __backend::shape(a_base)[1],
+        a_base.values().size(), a_base.rowptr().data(), a_base.colind().data(),
+        a_base.values().data(),
+        to_cusparse_indextype<typename matrix_type::offset_type>(),
+        to_cusparse_indextype<typename matrix_type::index_type>(),
+        CUSPARSE_INDEX_BASE_ZERO, to_cuda_datatype<value_type>()));
+    __cusparse::throw_if_error(cusparseCreateCsr(
+        &mat_b_, __backend::shape(b_base)[0], __backend::shape(b_base)[1],
+        b_base.values().size(), b_base.rowptr().data(), b_base.colind().data(),
+        b_base.values().data(),
+        to_cusparse_indextype<typename matrix_type::offset_type>(),
+        to_cusparse_indextype<typename matrix_type::index_type>(),
+        CUSPARSE_INDEX_BASE_ZERO, to_cuda_datatype<value_type>()));
+    __cusparse::throw_if_error(cusparseCreateCsr(
+        &mat_c_, __backend::shape(a)[0], __backend::shape(b)[1], 0,
+        c.rowptr().data(), NULL, NULL,
+        to_cusparse_indextype<typename matrix_type::offset_type>(),
+        to_cusparse_indextype<typename matrix_type::index_type>(),
+        CUSPARSE_INDEX_BASE_ZERO, to_cuda_datatype<value_type>()));
+
+    // ask bufferSize1 bytes for external memory
+    size_t buffer_size_1 = 0;
+    __cusparse::throw_if_error(cusparseSpGEMM_workEstimation(
+        handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
+        CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, mat_a_, mat_b_, &beta, mat_c_,
+        to_cuda_datatype<value_type>(), CUSPARSE_SPGEMM_DEFAULT, this->descr_,
+        &buffer_size_1, NULL));
+    if (buffer_size_1 > this->buffer_size_1_) {
+      this->alloc_.deallocate(this->workspace_1_, buffer_size_1_);
+      this->buffer_size_1_ = buffer_size_1;
+      this->workspace_1_ = this->alloc_.allocate(buffer_size_1);
+    }
+    // inspect the matrices A and B to understand the memory requirement for
+    // the next step
+    __cusparse::throw_if_error(cusparseSpGEMM_workEstimation(
+        handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
+        CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, mat_a_, mat_b_, &beta, mat_c_,
+        to_cuda_datatype<value_type>(), CUSPARSE_SPGEMM_DEFAULT, this->descr_,
+        &buffer_size_1, this->workspace_1_));
+
+    // ask buffer_size_2 bytes for external memory
+    size_t buffer_size_2 = 0;
+    __cusparse::throw_if_error(cusparseSpGEMM_compute(
+        handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
+        CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, mat_a_, mat_b_, &beta, mat_c_,
+        to_cuda_datatype<value_type>(), CUSPARSE_SPGEMM_DEFAULT, this->descr_,
+        &buffer_size_2, NULL));
+    if (buffer_size_2 > this->buffer_size_2_) {
+      this->alloc_.deallocate(this->workspace_2_, buffer_size_2_);
+      this->buffer_size_2_ = buffer_size_2;
+      this->workspace_2_ = this->alloc_.allocate(buffer_size_2);
+    }
+
+    // compute the intermediate product of A * B
+    cusparseSpGEMM_compute(
+        handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
+        CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, mat_a_, mat_b_, &beta, mat_c_,
+        to_cuda_datatype<value_type>(), CUSPARSE_SPGEMM_DEFAULT, this->descr_,
+        &buffer_size_2, this->workspace_2_);
+    // get matrix C non-zero entries c_nnz
+    int64_t c_num_rows, c_num_cols, c_nnz;
+    cusparseSpMatGetSize(mat_c_, &c_num_rows, &c_num_cols, &c_nnz);
+    this->result_nnz_ = c_nnz;
+    this->result_shape_ = index<index_t>(c_num_rows, c_num_cols);
+  }
+
+  // C = AB
+  template <matrix A, matrix B, matrix C>
+    requires __detail::has_csr_base<A> && __detail::has_csr_base<B> &&
+             __detail::is_csr_view_v<C>
+  void multiply_fill(A&& a, B&& b, C&& c) {
+    auto a_base = __detail::get_ultimate_base(a);
+    auto b_base = __detail::get_ultimate_base(b);
+    using matrix_type = decltype(a_base);
+    using input_type = decltype(b_base);
+    using output_type = std::remove_reference_t<decltype(c)>;
+    using value_type = typename matrix_type::scalar_type;
+
+    auto alpha_optional = __detail::get_scaling_factor(a, b);
+    value_type alpha = alpha_optional.value_or(1);
+    value_type beta = 1;
+    auto handle = this->handle_.get();
+    __cusparse::throw_if_error(cusparseCsrSetPointers(
+        mat_c_, c.rowptr().data(), c.colind().data(), c.values().data()));
+
+    __cusparse::throw_if_error(cusparseSpGEMM_copy(
+        handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
+        CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, mat_a_, mat_b_, &beta, mat_c_,
+        to_cuda_datatype<value_type>(), CUSPARSE_SPGEMM_DEFAULT, this->descr_));
+  }
+
+private:
+  using handle_manager =
+      std::unique_ptr<std::pointer_traits<cusparseHandle_t>::element_type,
+                      std::function<void(cusparseHandle_t)>>;
+  handle_manager handle_;
+  cusparse::cuda_allocator<char> alloc_;
+  size_t buffer_size_1_;
+  size_t buffer_size_2_;
+  char* workspace_1_;
+  char* workspace_2_;
+  index<index_t> result_shape_;
+  index_t result_nnz_;
+  cusparseSpMatDescr_t mat_a_;
+  cusparseSpMatDescr_t mat_b_;
+  cusparseSpMatDescr_t mat_c_;
+  cusparseSpGEMMDescr_t descr_;
+};
+
+template <matrix A, matrix B, matrix C>
+  requires __detail::has_csr_base<A> && __detail::has_csr_base<B> &&
+           __detail::is_csr_view_v<C>
+void multiply_inspect(spgemm_state_t& spgemm_handle, A&& a, B&& b, C&& c) {}
+
+template <matrix A, matrix B, matrix C>
+  requires __detail::has_csr_base<A> && __detail::has_csr_base<B> &&
+           __detail::is_csr_view_v<C>
+void multiply_compute(spgemm_state_t& spgemm_handle, A&& a, B&& b, C&& c) {
+  spgemm_handle.multiply_compute(a, b, c);
+}
+
+template <matrix A, matrix B, matrix C>
+  requires __detail::has_csr_base<A> && __detail::has_csr_base<B> &&
+           __detail::is_csr_view_v<C>
+void multiply_fill(spgemm_state_t& spgemm_handle, A&& a, B&& b, C&& c) {
+  spgemm_handle.multiply_fill(a, b, c);
+}
+
+} // namespace spblas
diff --git a/test/gtest/CMakeLists.txt b/test/gtest/CMakeLists.txt
index cd96de1..4a468da 100644
--- a/test/gtest/CMakeLists.txt
+++ b/test/gtest/CMakeLists.txt
@@ -20,7 +20,7 @@ if (SPBLAS_GPU_BACKEND)
     set(GPUTEST_SOURCES device/spmv_test.cpp device/spgemm_test.cpp device/spgemm_reuse_test.cpp device/rocsparse/spgemm_4args_test.cpp)
     set_source_files_properties(${GPUTEST_SOURCES} PROPERTIES LANGUAGE HIP)
   else ()
-    set(GPUTEST_SOURCES device/spmv_test.cpp)
+    set(GPUTEST_SOURCES device/spmv_test.cpp device/spgemm_test.cpp)
   endif ()
   list(APPEND TEST_SOURCES ${GPUTEST_SOURCES})
 endif()
diff --git a/test/gtest/device/spgemm_test.cpp b/test/gtest/device/spgemm_test.cpp
index 98e1aed..03d5d1f 100644
--- a/test/gtest/device/spgemm_test.cpp
+++ b/test/gtest/device/spgemm_test.cpp
@@ -33,8 +33,10 @@ TEST(thrust_CsrView, SpGEMM) {
           d_b_colind.data().get(), b_shape, b_nnz);
       spblas::csr_view<value_t, index_t, offset_t> b(b_values, b_rowptr,
                                                      b_colind, b_shape, b_nnz);
-
-      thrust::device_vector<offset_t> d_c_rowptr(m + 1);
+      std::vector<offset_t> c_rowptr(m + 1);
+      // Can not create the vector with the size because it will use for_each to
+      // initilize the value in cuda.
+      thrust::device_vector<offset_t> d_c_rowptr(c_rowptr);
 
       spblas::csr_view<value_t, index_t, offset_t> d_c(
           nullptr, d_c_rowptr.data().get(), nullptr, {m, n}, 0);
@@ -42,8 +44,10 @@ TEST(thrust_CsrView, SpGEMM) {
       spblas::spgemm_state_t state;
       spblas::multiply_compute(state, d_a, d_b, d_c);
       auto nnz = state.result_nnz();
-      thrust::device_vector<value_t> d_c_values(nnz);
-      thrust::device_vector<index_t> d_c_colind(nnz);
+      std::vector<value_t> c_values(nnz);
+      std::vector<index_t> c_colind(nnz);
+      thrust::device_vector<value_t> d_c_values(c_values);
+      thrust::device_vector<index_t> d_c_colind(c_colind);
       std::span<value_t> d_c_values_span(d_c_values.data().get(), nnz);
       std::span<offset_t> d_c_rowptr_span(d_c_rowptr.data().get(), m + 1);
       std::span<index_t> d_c_colind_span(d_c_colind.data().get(), nnz);
@@ -52,9 +56,6 @@ TEST(thrust_CsrView, SpGEMM) {
 
       spblas::multiply_fill(state, d_a, d_b, d_c);
 
-      std::vector<value_t> c_values(nnz);
-      std::vector<offset_t> c_rowptr(m + 1);
-      std::vector<index_t> c_colind(nnz);
       thrust::copy(d_c_values.begin(), d_c_values.end(), c_values.begin());
       thrust::copy(d_c_rowptr.begin(), d_c_rowptr.end(), c_rowptr.begin());
       thrust::copy(d_c_colind.begin(), d_c_colind.end(), c_colind.begin());
@@ -121,8 +122,8 @@ TEST(thrust_CsrView, SpGEMM_AScaled) {
           d_b_colind.data().get(), b_shape, b_nnz);
       spblas::csr_view<value_t, index_t, offset_t> b(b_values, b_rowptr,
                                                      b_colind, b_shape, b_nnz);
-
-      thrust::device_vector<offset_t> d_c_rowptr(m + 1);
+      std::vector<offset_t> c_rowptr(m + 1);
+      thrust::device_vector<offset_t> d_c_rowptr(c_rowptr);
 
       spblas::csr_view<value_t, index_t, offset_t> d_c(
           nullptr, d_c_rowptr.data().get(), nullptr, {m, n}, 0);
@@ -130,8 +131,10 @@ TEST(thrust_CsrView, SpGEMM_AScaled) {
       spblas::spgemm_state_t state;
       spblas::multiply_compute(state, spblas::scaled(alpha, d_a), d_b, d_c);
       auto nnz = state.result_nnz();
-      thrust::device_vector<value_t> d_c_values(nnz);
-      thrust::device_vector<index_t> d_c_colind(nnz);
+      std::vector<value_t> c_values(nnz);
+      std::vector<index_t> c_colind(nnz);
+      thrust::device_vector<value_t> d_c_values(c_values);
+      thrust::device_vector<index_t> d_c_colind(c_colind);
       std::span<value_t> d_c_values_span(d_c_values.data().get(), nnz);
       std::span<offset_t> d_c_rowptr_span(d_c_rowptr.data().get(), m + 1);
       std::span<index_t> d_c_colind_span(d_c_colind.data().get(), nnz);
@@ -140,9 +143,6 @@ TEST(thrust_CsrView, SpGEMM_AScaled) {
 
       spblas::multiply_fill(state, spblas::scaled(alpha, d_a), d_b, d_c);
 
-      std::vector<value_t> c_values(nnz);
-      std::vector<offset_t> c_rowptr(m + 1);
-      std::vector<index_t> c_colind(nnz);
       thrust::copy(d_c_values.begin(), d_c_values.end(), c_values.begin());
       thrust::copy(d_c_rowptr.begin(), d_c_rowptr.end(), c_rowptr.begin());
       thrust::copy(d_c_colind.begin(), d_c_colind.end(), c_colind.begin());
@@ -210,7 +210,8 @@ TEST(thrust_CsrView, SpGEMM_BScaled) {
       spblas::csr_view<value_t, index_t, offset_t> b(b_values, b_rowptr,
                                                      b_colind, b_shape, b_nnz);
 
-      thrust::device_vector<offset_t> d_c_rowptr(m + 1);
+      std::vector<offset_t> c_rowptr(m + 1);
+      thrust::device_vector<offset_t> d_c_rowptr(c_rowptr);
 
       spblas::csr_view<value_t, index_t, offset_t> d_c(
           nullptr, d_c_rowptr.data().get(), nullptr, {m, n}, 0);
@@ -218,8 +219,10 @@ TEST(thrust_CsrView, SpGEMM_BScaled) {
       spblas::spgemm_state_t state;
       spblas::multiply_compute(state, d_a, spblas::scaled(alpha, d_b), d_c);
       auto nnz = state.result_nnz();
-      thrust::device_vector<value_t> d_c_values(nnz);
-      thrust::device_vector<index_t> d_c_colind(nnz);
+      std::vector<value_t> c_values(nnz);
+      std::vector<index_t> c_colind(nnz);
+      thrust::device_vector<value_t> d_c_values(c_values);
+      thrust::device_vector<index_t> d_c_colind(c_colind);
       std::span<value_t> d_c_values_span(d_c_values.data().get(), nnz);
       std::span<offset_t> d_c_rowptr_span(d_c_rowptr.data().get(), m + 1);
       std::span<index_t> d_c_colind_span(d_c_colind.data().get(), nnz);
@@ -228,9 +231,6 @@ TEST(thrust_CsrView, SpGEMM_BScaled) {
 
       spblas::multiply_fill(state, d_a, spblas::scaled(alpha, d_b), d_c);
 
-      std::vector<value_t> c_values(nnz);
-      std::vector<offset_t> c_rowptr(m + 1);
-      std::vector<index_t> c_colind(nnz);
       thrust::copy(d_c_values.begin(), d_c_values.end(), c_values.begin());
       thrust::copy(d_c_rowptr.begin(), d_c_rowptr.end(), c_rowptr.begin());
       thrust::copy(d_c_colind.begin(), d_c_colind.end(), c_colind.begin());

From b731ce7e42f47481083f86680093ea7c32a6851d Mon Sep 17 00:00:00 2001
From: "Yu-Hsiang M. Tsai" <yhmtsai@gmail.com>
Date: Thu, 5 Jun 2025 17:25:54 +0200
Subject: [PATCH 12/14] use the helper function to create descriptor

---
 include/spblas/vendor/cusparse/descriptor.hpp | 45 +++++++++++++++++++
 .../vendor/cusparse/multiply_spgemm.hpp       | 34 +++++---------
 2 files changed, 55 insertions(+), 24 deletions(-)
 create mode 100644 include/spblas/vendor/cusparse/descriptor.hpp

diff --git a/include/spblas/vendor/cusparse/descriptor.hpp b/include/spblas/vendor/cusparse/descriptor.hpp
new file mode 100644
index 0000000..e3ed2a1
--- /dev/null
+++ b/include/spblas/vendor/cusparse/descriptor.hpp
@@ -0,0 +1,45 @@
+#pragma once
+
+#include <type_traits>
+
+#include <cusparse.h>
+
+#include <spblas/concepts.hpp>
+#include <spblas/views/inspectors.hpp>
+
+#include "exception.hpp"
+#include "types.hpp"
+
+namespace spblas {
+namespace __cusparse {
+
+// create matrix descriptor from spblas csr view
+template <matrix mat>
+  requires __detail::is_csr_view_v<mat>
+cusparseSpMatDescr_t create_matrix_descr(mat&& a) {
+  using matrix_type = std::remove_cvref_t<mat>;
+  cusparseSpMatDescr_t descr;
+  throw_if_error(cusparseCreateCsr(
+      &descr, __backend::shape(a)[0], __backend::shape(a)[1], a.values().size(),
+      a.rowptr().data(), a.colind().data(), a.values().data(),
+      to_cusparse_indextype<typename matrix_type::offset_type>(),
+      to_cusparse_indextype<typename matrix_type::index_type>(),
+      CUSPARSE_INDEX_BASE_ZERO,
+      to_cuda_datatype<typename matrix_type::scalar_type>()));
+  return descr;
+}
+
+// create dense vector from mdspan
+template <vector vec>
+  requires __ranges::contiguous_range<vec>
+cusparseDnVecDescr_t create_vector_descr(vec&& v) {
+  using vector_type = std::remove_cvref_t<vec>;
+  cusparseDnVecDescr_t descr;
+  throw_if_error(cusparseCreateDnVec(
+      &descr, v.size(), v.data(),
+      to_cuda_datatype<typename vector_type::value_type>()));
+  return descr;
+}
+
+} // namespace __cusparse
+} // namespace spblas
diff --git a/include/spblas/vendor/cusparse/multiply_spgemm.hpp b/include/spblas/vendor/cusparse/multiply_spgemm.hpp
index 8b2e181..f49b1d5 100644
--- a/include/spblas/vendor/cusparse/multiply_spgemm.hpp
+++ b/include/spblas/vendor/cusparse/multiply_spgemm.hpp
@@ -11,6 +11,7 @@
 #include <spblas/detail/view_inspectors.hpp>
 
 #include "cuda_allocator.hpp"
+#include "descriptor.hpp"
 #include "exception.hpp"
 #include "types.hpp"
 
@@ -78,27 +79,12 @@ class spgemm_state_t {
     value_type alpha = alpha_optional.value_or(1);
     value_type beta = 1;
     auto handle = this->handle_.get();
-    // Create sparse matrix A in CSR format
-    __cusparse::throw_if_error(cusparseCreateCsr(
-        &mat_a_, __backend::shape(a_base)[0], __backend::shape(a_base)[1],
-        a_base.values().size(), a_base.rowptr().data(), a_base.colind().data(),
-        a_base.values().data(),
-        to_cusparse_indextype<typename matrix_type::offset_type>(),
-        to_cusparse_indextype<typename matrix_type::index_type>(),
-        CUSPARSE_INDEX_BASE_ZERO, to_cuda_datatype<value_type>()));
-    __cusparse::throw_if_error(cusparseCreateCsr(
-        &mat_b_, __backend::shape(b_base)[0], __backend::shape(b_base)[1],
-        b_base.values().size(), b_base.rowptr().data(), b_base.colind().data(),
-        b_base.values().data(),
-        to_cusparse_indextype<typename matrix_type::offset_type>(),
-        to_cusparse_indextype<typename matrix_type::index_type>(),
-        CUSPARSE_INDEX_BASE_ZERO, to_cuda_datatype<value_type>()));
-    __cusparse::throw_if_error(cusparseCreateCsr(
-        &mat_c_, __backend::shape(a)[0], __backend::shape(b)[1], 0,
-        c.rowptr().data(), NULL, NULL,
-        to_cusparse_indextype<typename matrix_type::offset_type>(),
-        to_cusparse_indextype<typename matrix_type::index_type>(),
-        CUSPARSE_INDEX_BASE_ZERO, to_cuda_datatype<value_type>()));
+    __cusparse::throw_if_error(cusparseDestroySpMat(mat_a_));
+    __cusparse::throw_if_error(cusparseDestroySpMat(mat_b_));
+    __cusparse::throw_if_error(cusparseDestroySpMat(mat_c_));
+    mat_a_ = __cusparse::create_matrix_descr(a_base);
+    mat_b_ = __cusparse::create_matrix_descr(b_base);
+    mat_c_ = __cusparse::create_matrix_descr(c);
 
     // ask bufferSize1 bytes for external memory
     size_t buffer_size_1 = 0;
@@ -183,9 +169,9 @@ class spgemm_state_t {
   char* workspace_2_;
   index<index_t> result_shape_;
   index_t result_nnz_;
-  cusparseSpMatDescr_t mat_a_;
-  cusparseSpMatDescr_t mat_b_;
-  cusparseSpMatDescr_t mat_c_;
+  cusparseSpMatDescr_t mat_a_ = nullptr;
+  cusparseSpMatDescr_t mat_b_ = nullptr;
+  cusparseSpMatDescr_t mat_c_ = nullptr;
   cusparseSpGEMMDescr_t descr_;
 };
 

From 144aa14db682a63255f70ee8b91d57a4b6fcf0eb Mon Sep 17 00:00:00 2001
From: "Yu-Hsiang M. Tsai" <yhmtsai@gmail.com>
Date: Thu, 5 Jun 2025 18:23:33 +0200
Subject: [PATCH 13/14] create cusparse spgemm reuse

---
 .../vendor/cusparse/multiply_spgemm.hpp       | 186 +++++++++++++++++-
 test/gtest/CMakeLists.txt                     |   2 +-
 test/gtest/device/spgemm_reuse_test.cpp       |  70 +++----
 3 files changed, 220 insertions(+), 38 deletions(-)

diff --git a/include/spblas/vendor/cusparse/multiply_spgemm.hpp b/include/spblas/vendor/cusparse/multiply_spgemm.hpp
index f49b1d5..e5fa1ac 100644
--- a/include/spblas/vendor/cusparse/multiply_spgemm.hpp
+++ b/include/spblas/vendor/cusparse/multiply_spgemm.hpp
@@ -22,9 +22,10 @@ class spgemm_state_t {
   spgemm_state_t() : spgemm_state_t(cusparse::cuda_allocator<char>{}) {}
 
   spgemm_state_t(cusparse::cuda_allocator<char> alloc)
-      : alloc_(alloc), buffer_size_1_(0), buffer_size_2_(0),
-        workspace_1_(nullptr), workspace_2_(nullptr), result_nnz_(0),
-        result_shape_(0, 0) {
+      : alloc_(alloc), buffer_size_1_(0), buffer_size_2_(0), buffer_size_3_(0),
+        buffer_size_4_(0), buffer_size_5_(0), workspace_1_(nullptr),
+        workspace_2_(nullptr), workspace_3_(nullptr), workspace_4_(nullptr),
+        workspace_5_(nullptr), result_nnz_(0), result_shape_(0, 0) {
     cusparseHandle_t handle;
     __cusparse::throw_if_error(cusparseCreate(&handle));
     if (auto stream = alloc.stream()) {
@@ -157,6 +158,156 @@ class spgemm_state_t {
         to_cuda_datatype<value_type>(), CUSPARSE_SPGEMM_DEFAULT, this->descr_));
   }
 
+  template <matrix A, matrix B, matrix C>
+    requires __detail::has_csr_base<A> && __detail::has_csr_base<B> &&
+             __detail::is_csr_view_v<C>
+  void multiply_symbolic_compute(A&& a, B&& b, C&& c) {
+    auto a_base = __detail::get_ultimate_base(a);
+    auto b_base = __detail::get_ultimate_base(b);
+    using matrix_type = decltype(a_base);
+    using input_type = decltype(b_base);
+    using output_type = std::remove_reference_t<decltype(c)>;
+    using value_type = typename matrix_type::scalar_type;
+    size_t buffer_size = 0;
+
+    auto alpha_optional = __detail::get_scaling_factor(a, b);
+    value_type alpha = alpha_optional.value_or(1);
+    value_type beta = 1;
+    auto handle = this->handle_.get();
+    __cusparse::throw_if_error(cusparseDestroySpMat(mat_a_));
+    __cusparse::throw_if_error(cusparseDestroySpMat(mat_b_));
+    __cusparse::throw_if_error(cusparseDestroySpMat(mat_c_));
+    mat_a_ = __cusparse::create_matrix_descr(a_base);
+    mat_b_ = __cusparse::create_matrix_descr(b_base);
+    mat_c_ = __cusparse::create_matrix_descr(c);
+
+    // ask bufferSize1 bytes for external memory
+    size_t buffer_size_1 = 0;
+    __cusparse::throw_if_error(cusparseSpGEMMreuse_workEstimation(
+        handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
+        CUSPARSE_OPERATION_NON_TRANSPOSE, mat_a_, mat_b_, mat_c_,
+        CUSPARSE_SPGEMM_DEFAULT, this->descr_, &buffer_size_1, NULL));
+    if (buffer_size_1 > this->buffer_size_1_) {
+      this->alloc_.deallocate(this->workspace_1_, buffer_size_1_);
+      this->buffer_size_1_ = buffer_size_1;
+      this->workspace_1_ = this->alloc_.allocate(buffer_size_1);
+    }
+    // inspect the matrices A and B to understand the memory requirement for
+    // the next step
+    __cusparse::throw_if_error(cusparseSpGEMMreuse_workEstimation(
+        handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
+        CUSPARSE_OPERATION_NON_TRANSPOSE, mat_a_, mat_b_, mat_c_,
+        CUSPARSE_SPGEMM_DEFAULT, this->descr_, &buffer_size_1,
+        this->workspace_1_));
+
+    // ask buffer_size_2/3/4 bytes for external memory
+    size_t buffer_size_2 = 0;
+    size_t buffer_size_3 = 0;
+    size_t buffer_size_4 = 0;
+    cusparseSpGEMMreuse_nnz(handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
+                            CUSPARSE_OPERATION_NON_TRANSPOSE, mat_a_, mat_b_,
+                            mat_c_, CUSPARSE_SPGEMM_DEFAULT, this->descr_,
+                            &buffer_size_2, NULL, &buffer_size_3, NULL,
+                            &buffer_size_4, NULL);
+    if (buffer_size_2 > this->buffer_size_2_) {
+      this->alloc_.deallocate(this->workspace_2_, buffer_size_2_);
+      this->buffer_size_2_ = buffer_size_2;
+      this->workspace_2_ = this->alloc_.allocate(buffer_size_2);
+    }
+    if (buffer_size_3 > this->buffer_size_3_) {
+      this->alloc_.deallocate(this->workspace_3_, buffer_size_3_);
+      this->buffer_size_3_ = buffer_size_3;
+      this->workspace_3_ = this->alloc_.allocate(buffer_size_3);
+    }
+    if (buffer_size_4 > this->buffer_size_4_) {
+      this->alloc_.deallocate(this->workspace_4_, buffer_size_4_);
+      this->buffer_size_4_ = buffer_size_4;
+      this->workspace_4_ = this->alloc_.allocate(buffer_size_4);
+    }
+
+    // compute nnz
+    cusparseSpGEMMreuse_nnz(handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
+                            CUSPARSE_OPERATION_NON_TRANSPOSE, mat_a_, mat_b_,
+                            mat_c_, CUSPARSE_SPGEMM_DEFAULT, this->descr_,
+                            &buffer_size_2, this->workspace_2_, &buffer_size_3,
+                            this->workspace_3_, &buffer_size_4,
+                            this->workspace_4_);
+    // get matrix C non-zero entries c_nnz
+    int64_t c_num_rows, c_num_cols, c_nnz;
+    cusparseSpMatGetSize(mat_c_, &c_num_rows, &c_num_cols, &c_nnz);
+    this->result_nnz_ = c_nnz;
+    this->result_shape_ = index<index_t>(c_num_rows, c_num_cols);
+  }
+
+  template <matrix A, matrix B, matrix C>
+    requires __detail::has_csr_base<A> && __detail::has_csr_base<B> &&
+             __detail::is_csr_view_v<C>
+  void multiply_symbolic_fill(A&& a, B&& b, C&& c) {
+    auto a_base = __detail::get_ultimate_base(a);
+    auto b_base = __detail::get_ultimate_base(b);
+    using matrix_type = decltype(a_base);
+    using input_type = decltype(b_base);
+    using output_type = std::remove_reference_t<decltype(c)>;
+    using value_type = typename matrix_type::scalar_type;
+
+    auto alpha_optional = __detail::get_scaling_factor(a, b);
+    value_type alpha = alpha_optional.value_or(1);
+    value_type beta = 0;
+
+    __cusparse::throw_if_error(cusparseCsrSetPointers(
+        this->mat_c_, c.rowptr().data(), c.colind().data(), c.values().data()));
+
+    auto handle = this->handle_.get();
+    size_t buffer_size_5 = 0;
+    cusparseSpGEMMreuse_copy(handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
+                             CUSPARSE_OPERATION_NON_TRANSPOSE, mat_a_, mat_b_,
+                             mat_c_, CUSPARSE_SPGEMM_DEFAULT, this->descr_,
+                             &buffer_size_5, NULL);
+    if (buffer_size_5 > this->buffer_size_5_) {
+      this->alloc_.deallocate(this->workspace_5_, buffer_size_5_);
+      this->buffer_size_5_ = buffer_size_5;
+      this->workspace_5_ = this->alloc_.allocate(buffer_size_5);
+    }
+    cusparseSpGEMMreuse_copy(handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
+                             CUSPARSE_OPERATION_NON_TRANSPOSE, mat_a_, mat_b_,
+                             mat_c_, CUSPARSE_SPGEMM_DEFAULT, this->descr_,
+                             &buffer_size_5, this->workspace_5_);
+  }
+
+  template <matrix A, matrix B, matrix C>
+    requires __detail::has_csr_base<A> && __detail::has_csr_base<B> &&
+             __detail::is_csr_view_v<C>
+  void multiply_numeric(A&& a, B&& b, C&& c) {
+    auto a_base = __detail::get_ultimate_base(a);
+    auto b_base = __detail::get_ultimate_base(b);
+    using matrix_type = decltype(a_base);
+    using input_type = decltype(b_base);
+    using output_type = std::remove_reference_t<decltype(c)>;
+    using value_type = typename matrix_type::scalar_type;
+
+    auto alpha_optional = __detail::get_scaling_factor(a, b);
+    tensor_scalar_t<A> alpha = alpha_optional.value_or(1);
+    value_type alpha_val = alpha;
+    value_type beta = 0;
+
+    auto handle = this->handle_.get();
+
+    // Update the pointer from the matrix but they must contains the same
+    // sparsity as the previous call.
+    __cusparse::throw_if_error(
+        cusparseCsrSetPointers(this->mat_a_, a_base.rowptr().data(),
+                               a_base.colind().data(), a_base.values().data()));
+    __cusparse::throw_if_error(
+        cusparseCsrSetPointers(this->mat_b_, b_base.rowptr().data(),
+                               b_base.colind().data(), b_base.values().data()));
+    __cusparse::throw_if_error(cusparseCsrSetPointers(
+        this->mat_c_, c.rowptr().data(), c.colind().data(), c.values().data()));
+    cusparseSpGEMMreuse_compute(
+        handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
+        CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, mat_a_, mat_b_, &beta, mat_c_,
+        to_cuda_datatype<value_type>(), CUSPARSE_SPGEMM_DEFAULT, this->descr_);
+  }
+
 private:
   using handle_manager =
       std::unique_ptr<std::pointer_traits<cusparseHandle_t>::element_type,
@@ -165,8 +316,14 @@ class spgemm_state_t {
   cusparse::cuda_allocator<char> alloc_;
   size_t buffer_size_1_;
   size_t buffer_size_2_;
+  size_t buffer_size_3_;
+  size_t buffer_size_4_;
+  size_t buffer_size_5_;
   char* workspace_1_;
   char* workspace_2_;
+  char* workspace_3_;
+  char* workspace_4_;
+  char* workspace_5_;
   index<index_t> result_shape_;
   index_t result_nnz_;
   cusparseSpMatDescr_t mat_a_ = nullptr;
@@ -194,4 +351,27 @@ void multiply_fill(spgemm_state_t& spgemm_handle, A&& a, B&& b, C&& c) {
   spgemm_handle.multiply_fill(a, b, c);
 }
 
+template <matrix A, matrix B, matrix C>
+  requires __detail::has_csr_base<A> && __detail::has_csr_base<B> &&
+           __detail::is_csr_view_v<C>
+void multiply_symbolic_compute(spgemm_state_t& spgemm_handle, A&& a, B&& b,
+                               C&& c) {
+  spgemm_handle.multiply_symbolic_compute(a, b, c);
+}
+
+template <matrix A, matrix B, matrix C>
+  requires __detail::has_csr_base<A> && __detail::has_csr_base<B> &&
+           __detail::is_csr_view_v<C>
+void multiply_symbolic_fill(spgemm_state_t& spgemm_handle, A&& a, B&& b,
+                            C&& c) {
+  spgemm_handle.multiply_symbolic_fill(a, b, c);
+}
+
+template <matrix A, matrix B, matrix C>
+  requires __detail::has_csr_base<A> && __detail::has_csr_base<B> &&
+           __detail::is_csr_view_v<C>
+void multiply_numeric(spgemm_state_t& spgemm_handle, A&& a, B&& b, C&& c) {
+  spgemm_handle.multiply_numeric(a, b, c);
+}
+
 } // namespace spblas
diff --git a/test/gtest/CMakeLists.txt b/test/gtest/CMakeLists.txt
index 4a468da..884f549 100644
--- a/test/gtest/CMakeLists.txt
+++ b/test/gtest/CMakeLists.txt
@@ -20,7 +20,7 @@ if (SPBLAS_GPU_BACKEND)
     set(GPUTEST_SOURCES device/spmv_test.cpp device/spgemm_test.cpp device/spgemm_reuse_test.cpp device/rocsparse/spgemm_4args_test.cpp)
     set_source_files_properties(${GPUTEST_SOURCES} PROPERTIES LANGUAGE HIP)
   else ()
-    set(GPUTEST_SOURCES device/spmv_test.cpp device/spgemm_test.cpp)
+    set(GPUTEST_SOURCES device/spmv_test.cpp device/spgemm_test.cpp device/spgemm_reuse_test.cpp)
   endif ()
   list(APPEND TEST_SOURCES ${GPUTEST_SOURCES})
 endif()
diff --git a/test/gtest/device/spgemm_reuse_test.cpp b/test/gtest/device/spgemm_reuse_test.cpp
index b798d6e..3a6e339 100644
--- a/test/gtest/device/spgemm_reuse_test.cpp
+++ b/test/gtest/device/spgemm_reuse_test.cpp
@@ -34,7 +34,8 @@ TEST(thrust_CsrView, SpGEMMReuse) {
       spblas::csr_view<value_t, index_t, offset_t> b(b_values, b_rowptr,
                                                      b_colind, b_shape, b_nnz);
 
-      thrust::device_vector<offset_t> d_c_rowptr(m + 1);
+      std::vector<offset_t> c_rowptr(m + 1);
+      thrust::device_vector<offset_t> d_c_rowptr(c_rowptr);
 
       spblas::csr_view<value_t, index_t, offset_t> d_c(
           nullptr, d_c_rowptr.data().get(), nullptr, {m, n}, 0);
@@ -42,8 +43,10 @@ TEST(thrust_CsrView, SpGEMMReuse) {
       spblas::spgemm_state_t state;
       spblas::multiply_symbolic_compute(state, d_a, d_b, d_c);
       auto nnz = state.result_nnz();
-      thrust::device_vector<value_t> d_c_values(nnz);
-      thrust::device_vector<index_t> d_c_colind(nnz);
+      std::vector<value_t> c_values(nnz);
+      std::vector<index_t> c_colind(nnz);
+      thrust::device_vector<value_t> d_c_values(c_values);
+      thrust::device_vector<index_t> d_c_colind(c_colind);
       std::span<value_t> d_c_values_span(d_c_values.data().get(), nnz);
       std::span<offset_t> d_c_rowptr_span(d_c_rowptr.data().get(), m + 1);
       std::span<index_t> d_c_colind_span(d_c_colind.data().get(), nnz);
@@ -68,9 +71,6 @@ TEST(thrust_CsrView, SpGEMMReuse) {
           thrust::copy(b_values.begin(), b_values.end(), d_b_values.begin());
         }
         spblas::multiply_numeric(state, d_a, d_b, d_c);
-        std::vector<value_t> c_values(nnz);
-        std::vector<offset_t> c_rowptr(m + 1);
-        std::vector<index_t> c_colind(nnz);
         thrust::copy(d_c_values.begin(), d_c_values.end(), c_values.begin());
         thrust::copy(d_c_rowptr.begin(), d_c_rowptr.end(), c_rowptr.begin());
         thrust::copy(d_c_colind.begin(), d_c_colind.end(), c_colind.begin());
@@ -138,8 +138,8 @@ TEST(thrust_CsrView, SpGEMMReuse_AScaled) {
           d_b_colind.data().get(), b_shape, b_nnz);
       spblas::csr_view<value_t, index_t, offset_t> b(b_values, b_rowptr,
                                                      b_colind, b_shape, b_nnz);
-
-      thrust::device_vector<offset_t> d_c_rowptr(m + 1);
+      std::vector<offset_t> c_rowptr(m + 1);
+      thrust::device_vector<offset_t> d_c_rowptr(c_rowptr);
 
       spblas::csr_view<value_t, index_t, offset_t> d_c(
           nullptr, d_c_rowptr.data().get(), nullptr, {m, n}, 0);
@@ -147,8 +147,10 @@ TEST(thrust_CsrView, SpGEMMReuse_AScaled) {
       spblas::spgemm_state_t state;
       spblas::multiply_symbolic_compute(state, scaled(alpha, d_a), d_b, d_c);
       auto nnz = state.result_nnz();
-      thrust::device_vector<value_t> d_c_values(nnz);
-      thrust::device_vector<index_t> d_c_colind(nnz);
+      std::vector<value_t> c_values(nnz);
+      std::vector<index_t> c_colind(nnz);
+      thrust::device_vector<value_t> d_c_values(c_values);
+      thrust::device_vector<index_t> d_c_colind(c_colind);
       std::span<value_t> d_c_values_span(d_c_values.data().get(), nnz);
       std::span<offset_t> d_c_rowptr_span(d_c_rowptr.data().get(), m + 1);
       std::span<index_t> d_c_colind_span(d_c_colind.data().get(), nnz);
@@ -173,9 +175,6 @@ TEST(thrust_CsrView, SpGEMMReuse_AScaled) {
           thrust::copy(b_values.begin(), b_values.end(), d_b_values.begin());
         }
         spblas::multiply_numeric(state, scaled(alpha, d_a), d_b, d_c);
-        std::vector<value_t> c_values(nnz);
-        std::vector<offset_t> c_rowptr(m + 1);
-        std::vector<index_t> c_colind(nnz);
         thrust::copy(d_c_values.begin(), d_c_values.end(), c_values.begin());
         thrust::copy(d_c_rowptr.begin(), d_c_rowptr.end(), c_rowptr.begin());
         thrust::copy(d_c_colind.begin(), d_c_colind.end(), c_colind.begin());
@@ -243,8 +242,8 @@ TEST(thrust_CsrView, SpGEMMReuse_BScaled) {
           d_b_colind.data().get(), b_shape, b_nnz);
       spblas::csr_view<value_t, index_t, offset_t> b(b_values, b_rowptr,
                                                      b_colind, b_shape, b_nnz);
-
-      thrust::device_vector<offset_t> d_c_rowptr(m + 1);
+      std::vector<offset_t> c_rowptr(m + 1);
+      thrust::device_vector<offset_t> d_c_rowptr(c_rowptr);
 
       spblas::csr_view<value_t, index_t, offset_t> d_c(
           nullptr, d_c_rowptr.data().get(), nullptr, {m, n}, 0);
@@ -252,8 +251,10 @@ TEST(thrust_CsrView, SpGEMMReuse_BScaled) {
       spblas::spgemm_state_t state;
       spblas::multiply_symbolic_compute(state, d_a, scaled(alpha, d_b), d_c);
       auto nnz = state.result_nnz();
-      thrust::device_vector<value_t> d_c_values(nnz);
-      thrust::device_vector<index_t> d_c_colind(nnz);
+      std::vector<value_t> c_values(nnz);
+      std::vector<index_t> c_colind(nnz);
+      thrust::device_vector<value_t> d_c_values(c_values);
+      thrust::device_vector<index_t> d_c_colind(c_colind);
       std::span<value_t> d_c_values_span(d_c_values.data().get(), nnz);
       std::span<offset_t> d_c_rowptr_span(d_c_rowptr.data().get(), m + 1);
       std::span<index_t> d_c_colind_span(d_c_colind.data().get(), nnz);
@@ -278,9 +279,6 @@ TEST(thrust_CsrView, SpGEMMReuse_BScaled) {
           thrust::copy(b_values.begin(), b_values.end(), d_b_values.begin());
         }
         spblas::multiply_numeric(state, d_a, scaled(alpha, d_b), d_c);
-        std::vector<value_t> c_values(nnz);
-        std::vector<offset_t> c_rowptr(m + 1);
-        std::vector<index_t> c_colind(nnz);
         thrust::copy(d_c_values.begin(), d_c_values.end(), c_values.begin());
         thrust::copy(d_c_rowptr.begin(), d_c_rowptr.end(), c_rowptr.begin());
         thrust::copy(d_c_colind.begin(), d_c_colind.end(), c_colind.begin());
@@ -348,7 +346,8 @@ TEST(thrust_CsrView, SpGEMMReuseAndChangePointer) {
       spblas::csr_view<value_t, index_t, offset_t> b(b_values, b_rowptr,
                                                      b_colind, b_shape, b_nnz);
 
-      thrust::device_vector<offset_t> d_c_rowptr(m + 1);
+      std::vector<offset_t> c_rowptr(m + 1);
+      thrust::device_vector<offset_t> d_c_rowptr(c_rowptr);
 
       spblas::csr_view<value_t, index_t, offset_t> d_c(
           nullptr, d_c_rowptr.data().get(), nullptr, {m, n}, 0);
@@ -356,8 +355,10 @@ TEST(thrust_CsrView, SpGEMMReuseAndChangePointer) {
       spblas::spgemm_state_t state;
       spblas::multiply_symbolic_compute(state, d_a, d_b, d_c);
       auto nnz = state.result_nnz();
-      thrust::device_vector<value_t> d_c_values(nnz);
-      thrust::device_vector<index_t> d_c_colind(nnz);
+      std::vector<value_t> c_values(nnz);
+      std::vector<index_t> c_colind(nnz);
+      thrust::device_vector<value_t> d_c_values(c_values);
+      thrust::device_vector<index_t> d_c_colind(c_colind);
       std::span<value_t> d_c_values_span(d_c_values.data().get(), nnz);
       std::span<offset_t> d_c_rowptr_span(d_c_rowptr.data().get(), m + 1);
       std::span<index_t> d_c_colind_span(d_c_colind.data().get(), nnz);
@@ -365,6 +366,9 @@ TEST(thrust_CsrView, SpGEMMReuseAndChangePointer) {
                  nnz);
 
       spblas::multiply_symbolic_fill(state, d_a, d_b, d_c);
+      // move the sparsity back to host for later copy
+      thrust::copy(d_c_rowptr.begin(), d_c_rowptr.end(), c_rowptr.begin());
+      thrust::copy(d_c_colind.begin(), d_c_colind.end(), c_colind.begin());
       std::mt19937 g(0);
       for (int i = 0; i < 3; i++) {
         // regenerate value of a and b;
@@ -376,16 +380,17 @@ TEST(thrust_CsrView, SpGEMMReuseAndChangePointer) {
           v = val_dist(g);
         }
         // create different pointers than the symbolic phase, but they still
-        // hold the same sparsity
+        // hold the same sparsity.
+        // note. cuda without nvcc can only copy from host to device
         thrust::device_vector<value_t> d_a_values_new(a_values);
-        thrust::device_vector<index_t> d_a_colind_new(d_a_colind);
-        thrust::device_vector<index_t> d_a_rowptr_new(d_a_rowptr);
+        thrust::device_vector<index_t> d_a_colind_new(a_colind);
+        thrust::device_vector<index_t> d_a_rowptr_new(a_rowptr);
         thrust::device_vector<value_t> d_b_values_new(b_values);
-        thrust::device_vector<index_t> d_b_colind_new(d_b_colind);
-        thrust::device_vector<index_t> d_b_rowptr_new(d_b_rowptr);
-        thrust::device_vector<value_t> d_c_values_new(d_c_values);
-        thrust::device_vector<index_t> d_c_colind_new(d_c_colind);
-        thrust::device_vector<index_t> d_c_rowptr_new(d_c_rowptr);
+        thrust::device_vector<index_t> d_b_colind_new(b_colind);
+        thrust::device_vector<index_t> d_b_rowptr_new(b_rowptr);
+        thrust::device_vector<value_t> d_c_values_new(c_values);
+        thrust::device_vector<index_t> d_c_colind_new(c_colind);
+        thrust::device_vector<index_t> d_c_rowptr_new(c_rowptr);
         spblas::csr_view<value_t, index_t, offset_t> d_a(
             d_a_values_new.data().get(), d_a_rowptr_new.data().get(),
             d_a_colind_new.data().get(), a_shape, a_nnz);
@@ -398,9 +403,6 @@ TEST(thrust_CsrView, SpGEMMReuseAndChangePointer) {
         // call numeric on new data
         spblas::multiply_numeric(state, d_a, d_b, d_c);
         // move c back to host memory
-        std::vector<value_t> c_values(nnz);
-        std::vector<offset_t> c_rowptr(m + 1);
-        std::vector<index_t> c_colind(nnz);
         thrust::copy(d_c_values_new.begin(), d_c_values_new.end(),
                      c_values.begin());
         thrust::copy(d_c_rowptr_new.begin(), d_c_rowptr_new.end(),

From 7845296f6a9752e6f532956c868484209f8f23aa Mon Sep 17 00:00:00 2001
From: "Yu-Hsiang M. Tsai" <yhmtsai@gmail.com>
Date: Thu, 21 Aug 2025 16:35:23 +0200
Subject: [PATCH 14/14] reuse the current changes on main

---
 include/spblas/vendor/cusparse/descriptor.hpp | 45 -------------------
 .../vendor/cusparse/multiply_spgemm.hpp       | 42 ++++++++---------
 2 files changed, 22 insertions(+), 65 deletions(-)
 delete mode 100644 include/spblas/vendor/cusparse/descriptor.hpp

diff --git a/include/spblas/vendor/cusparse/descriptor.hpp b/include/spblas/vendor/cusparse/descriptor.hpp
deleted file mode 100644
index e3ed2a1..0000000
--- a/include/spblas/vendor/cusparse/descriptor.hpp
+++ /dev/null
@@ -1,45 +0,0 @@
-#pragma once
-
-#include <type_traits>
-
-#include <cusparse.h>
-
-#include <spblas/concepts.hpp>
-#include <spblas/views/inspectors.hpp>
-
-#include "exception.hpp"
-#include "types.hpp"
-
-namespace spblas {
-namespace __cusparse {
-
-// create matrix descriptor from spblas csr view
-template <matrix mat>
-  requires __detail::is_csr_view_v<mat>
-cusparseSpMatDescr_t create_matrix_descr(mat&& a) {
-  using matrix_type = std::remove_cvref_t<mat>;
-  cusparseSpMatDescr_t descr;
-  throw_if_error(cusparseCreateCsr(
-      &descr, __backend::shape(a)[0], __backend::shape(a)[1], a.values().size(),
-      a.rowptr().data(), a.colind().data(), a.values().data(),
-      to_cusparse_indextype<typename matrix_type::offset_type>(),
-      to_cusparse_indextype<typename matrix_type::index_type>(),
-      CUSPARSE_INDEX_BASE_ZERO,
-      to_cuda_datatype<typename matrix_type::scalar_type>()));
-  return descr;
-}
-
-// create dense vector from mdspan
-template <vector vec>
-  requires __ranges::contiguous_range<vec>
-cusparseDnVecDescr_t create_vector_descr(vec&& v) {
-  using vector_type = std::remove_cvref_t<vec>;
-  cusparseDnVecDescr_t descr;
-  throw_if_error(cusparseCreateDnVec(
-      &descr, v.size(), v.data(),
-      to_cuda_datatype<typename vector_type::value_type>()));
-  return descr;
-}
-
-} // namespace __cusparse
-} // namespace spblas
diff --git a/include/spblas/vendor/cusparse/multiply_spgemm.hpp b/include/spblas/vendor/cusparse/multiply_spgemm.hpp
index e5fa1ac..d8b9f71 100644
--- a/include/spblas/vendor/cusparse/multiply_spgemm.hpp
+++ b/include/spblas/vendor/cusparse/multiply_spgemm.hpp
@@ -11,7 +11,7 @@
 #include <spblas/detail/view_inspectors.hpp>
 
 #include "cuda_allocator.hpp"
-#include "descriptor.hpp"
+#include "detail/cusparse_tensors.hpp"
 #include "exception.hpp"
 #include "types.hpp"
 
@@ -83,17 +83,17 @@ class spgemm_state_t {
     __cusparse::throw_if_error(cusparseDestroySpMat(mat_a_));
     __cusparse::throw_if_error(cusparseDestroySpMat(mat_b_));
     __cusparse::throw_if_error(cusparseDestroySpMat(mat_c_));
-    mat_a_ = __cusparse::create_matrix_descr(a_base);
-    mat_b_ = __cusparse::create_matrix_descr(b_base);
-    mat_c_ = __cusparse::create_matrix_descr(c);
+    mat_a_ = __cusparse::create_cusparse_handle(a_base);
+    mat_b_ = __cusparse::create_cusparse_handle(b_base);
+    mat_c_ = __cusparse::create_cusparse_handle(c);
 
     // ask bufferSize1 bytes for external memory
     size_t buffer_size_1 = 0;
     __cusparse::throw_if_error(cusparseSpGEMM_workEstimation(
         handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
         CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, mat_a_, mat_b_, &beta, mat_c_,
-        to_cuda_datatype<value_type>(), CUSPARSE_SPGEMM_DEFAULT, this->descr_,
-        &buffer_size_1, NULL));
+        detail::cuda_data_type_v<value_type>, CUSPARSE_SPGEMM_DEFAULT,
+        this->descr_, &buffer_size_1, NULL));
     if (buffer_size_1 > this->buffer_size_1_) {
       this->alloc_.deallocate(this->workspace_1_, buffer_size_1_);
       this->buffer_size_1_ = buffer_size_1;
@@ -104,16 +104,16 @@ class spgemm_state_t {
     __cusparse::throw_if_error(cusparseSpGEMM_workEstimation(
         handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
         CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, mat_a_, mat_b_, &beta, mat_c_,
-        to_cuda_datatype<value_type>(), CUSPARSE_SPGEMM_DEFAULT, this->descr_,
-        &buffer_size_1, this->workspace_1_));
+        detail::cuda_data_type_v<value_type>, CUSPARSE_SPGEMM_DEFAULT,
+        this->descr_, &buffer_size_1, this->workspace_1_));
 
     // ask buffer_size_2 bytes for external memory
     size_t buffer_size_2 = 0;
     __cusparse::throw_if_error(cusparseSpGEMM_compute(
         handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
         CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, mat_a_, mat_b_, &beta, mat_c_,
-        to_cuda_datatype<value_type>(), CUSPARSE_SPGEMM_DEFAULT, this->descr_,
-        &buffer_size_2, NULL));
+        detail::cuda_data_type_v<value_type>, CUSPARSE_SPGEMM_DEFAULT,
+        this->descr_, &buffer_size_2, NULL));
     if (buffer_size_2 > this->buffer_size_2_) {
       this->alloc_.deallocate(this->workspace_2_, buffer_size_2_);
       this->buffer_size_2_ = buffer_size_2;
@@ -124,8 +124,8 @@ class spgemm_state_t {
     cusparseSpGEMM_compute(
         handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
         CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, mat_a_, mat_b_, &beta, mat_c_,
-        to_cuda_datatype<value_type>(), CUSPARSE_SPGEMM_DEFAULT, this->descr_,
-        &buffer_size_2, this->workspace_2_);
+        detail::cuda_data_type_v<value_type>, CUSPARSE_SPGEMM_DEFAULT,
+        this->descr_, &buffer_size_2, this->workspace_2_);
     // get matrix C non-zero entries c_nnz
     int64_t c_num_rows, c_num_cols, c_nnz;
     cusparseSpMatGetSize(mat_c_, &c_num_rows, &c_num_cols, &c_nnz);
@@ -155,7 +155,8 @@ class spgemm_state_t {
     __cusparse::throw_if_error(cusparseSpGEMM_copy(
         handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
         CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, mat_a_, mat_b_, &beta, mat_c_,
-        to_cuda_datatype<value_type>(), CUSPARSE_SPGEMM_DEFAULT, this->descr_));
+        detail::cuda_data_type_v<value_type>, CUSPARSE_SPGEMM_DEFAULT,
+        this->descr_));
   }
 
   template <matrix A, matrix B, matrix C>
@@ -177,9 +178,9 @@ class spgemm_state_t {
     __cusparse::throw_if_error(cusparseDestroySpMat(mat_a_));
     __cusparse::throw_if_error(cusparseDestroySpMat(mat_b_));
     __cusparse::throw_if_error(cusparseDestroySpMat(mat_c_));
-    mat_a_ = __cusparse::create_matrix_descr(a_base);
-    mat_b_ = __cusparse::create_matrix_descr(b_base);
-    mat_c_ = __cusparse::create_matrix_descr(c);
+    mat_a_ = __cusparse::create_cusparse_handle(a_base);
+    mat_b_ = __cusparse::create_cusparse_handle(b_base);
+    mat_c_ = __cusparse::create_cusparse_handle(c);
 
     // ask bufferSize1 bytes for external memory
     size_t buffer_size_1 = 0;
@@ -302,10 +303,11 @@ class spgemm_state_t {
                                b_base.colind().data(), b_base.values().data()));
     __cusparse::throw_if_error(cusparseCsrSetPointers(
         this->mat_c_, c.rowptr().data(), c.colind().data(), c.values().data()));
-    cusparseSpGEMMreuse_compute(
-        handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
-        CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, mat_a_, mat_b_, &beta, mat_c_,
-        to_cuda_datatype<value_type>(), CUSPARSE_SPGEMM_DEFAULT, this->descr_);
+    cusparseSpGEMMreuse_compute(handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
+                                CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha,
+                                mat_a_, mat_b_, &beta, mat_c_,
+                                detail::cuda_data_type_v<value_type>,
+                                CUSPARSE_SPGEMM_DEFAULT, this->descr_);
   }
 
 private: