From 405bb91bff72ede371e781bf74b48e5ad9489575 Mon Sep 17 00:00:00 2001 From: "Yu-Hsiang M. Tsai" Date: Thu, 27 Mar 2025 18:25:28 +0100 Subject: [PATCH 01/14] add rocsparse spgemm --- include/spblas/vendor/rocsparse/multiply.hpp | 2 +- .../vendor/rocsparse/multiply_spgemm.hpp | 326 ++++++++++++++++++ include/spblas/vendor/rocsparse/rocsparse.hpp | 1 + 3 files changed, 328 insertions(+), 1 deletion(-) create mode 100644 include/spblas/vendor/rocsparse/multiply_spgemm.hpp diff --git a/include/spblas/vendor/rocsparse/multiply.hpp b/include/spblas/vendor/rocsparse/multiply.hpp index f276fbf..7d88c5c 100644 --- a/include/spblas/vendor/rocsparse/multiply.hpp +++ b/include/spblas/vendor/rocsparse/multiply.hpp @@ -86,7 +86,7 @@ class spmv_state_t { // only allocate the new workspace when the requiring workspace larger than // current if (buffer_size > this->buffer_size_) { - this->alloc_.deallocate(this->workspace_, buffer_size_); + this->alloc_.deallocate(this->workspace_, this->buffer_size_); this->buffer_size_ = buffer_size; this->workspace_ = this->alloc_.allocate(buffer_size); } diff --git a/include/spblas/vendor/rocsparse/multiply_spgemm.hpp b/include/spblas/vendor/rocsparse/multiply_spgemm.hpp new file mode 100644 index 0000000..7f215df --- /dev/null +++ b/include/spblas/vendor/rocsparse/multiply_spgemm.hpp @@ -0,0 +1,326 @@ +#pragma once + +#include +#include +#include + +#include +#include + +#include +#include + +#include "exception.hpp" +#include "hip_allocator.hpp" +#include "types.hpp" + +namespace spblas { +namespace __rocsparse { + +template +T create_null_matrix() { + return {nullptr, nullptr, nullptr, index{0, 0}, 0}; +} + +} // namespace __rocsparse + +class spgemm_state_t { +public: + spgemm_state_t() : spgemm_state_t(rocsparse::hip_allocator{}) {} + + spgemm_state_t(rocsparse::hip_allocator alloc) + : alloc_(alloc), buffer_size_(0), workspace_(nullptr), result_nnz_(0), + result_shape_(0, 0) { + rocsparse_handle handle; + __rocsparse::throw_if_error(rocsparse_create_handle(&handle)); + if (auto stream = alloc.stream()) { + rocsparse_set_stream(handle, stream); + } + handle_ = handle_manager(handle, [](rocsparse_handle handle) { + __rocsparse::throw_if_error(rocsparse_destroy_handle(handle)); + }); + } + + spgemm_state_t(rocsparse::hip_allocator alloc, rocsparse_handle handle) + : alloc_(alloc), buffer_size_(0), workspace_(nullptr), result_nnz_(0), + result_shape_(0, 0) { + handle_ = handle_manager(handle, [](rocsparse_handle handle) { + // it is provided by user, we do not delete it at all. + }); + } + + ~spgemm_state_t() { + alloc_.deallocate(this->workspace_, this->buffer_size_); + __rocsparse::throw_if_error(rocsparse_destroy_spmat_descr(this->mat_a_)); + __rocsparse::throw_if_error(rocsparse_destroy_spmat_descr(this->mat_b_)); + __rocsparse::throw_if_error(rocsparse_destroy_spmat_descr(this->mat_c_)); + __rocsparse::throw_if_error(rocsparse_destroy_spmat_descr(this->mat_d_)); + } + + auto result_shape() { + return this->result_shape_; + } + + auto result_nnz() { + return this->result_nnz_; + } + + template + requires __detail::has_csr_base && __detail::has_csr_base && + __detail::is_csr_view_v && __detail::has_csr_base + void multiply_compute(A&& a, B&& b, C&& c, D&& d) { + auto a_base = __detail::get_ultimate_base(a); + auto b_base = __detail::get_ultimate_base(b); + auto d_base = __detail::get_ultimate_base(d); + using matrix_type = decltype(a_base); + using input_type = decltype(b_base); + using output_type = std::remove_reference_t; + using value_type = typename matrix_type::scalar_type; + + size_t buffer_size = 0; + + auto alpha_optional = __detail::get_scaling_factor(a, b); + value_type alpha = alpha_optional.value_or(1); + auto beta_optional = __detail::get_scaling_factor(d); + value_type beta = beta_optional.value_or(1); + auto handle = this->handle_.get(); + // Create sparse matrix A in CSR format + __rocsparse::throw_if_error(rocsparse_create_csr_descr( + &this->mat_a_, __backend::shape(a_base)[0], __backend::shape(a_base)[1], + a_base.values().size(), a_base.rowptr().data(), a_base.colind().data(), + a_base.values().data(), + to_rocsparse_indextype(), + to_rocsparse_indextype(), + rocsparse_index_base_zero, to_rocsparse_datatype())); + __rocsparse::throw_if_error(rocsparse_create_csr_descr( + &this->mat_b_, __backend::shape(b_base)[0], __backend::shape(b_base)[1], + b_base.values().size(), b_base.rowptr().data(), b_base.colind().data(), + b_base.values().data(), + to_rocsparse_indextype(), + to_rocsparse_indextype(), + rocsparse_index_base_zero, + to_rocsparse_datatype())); + __rocsparse::throw_if_error(rocsparse_create_csr_descr( + &this->mat_c_, __backend::shape(a_base)[0], __backend::shape(b_base)[1], + 0, c.rowptr().data(), NULL, NULL, + to_rocsparse_indextype(), + to_rocsparse_indextype(), + rocsparse_index_base_zero, + to_rocsparse_datatype())); + __rocsparse::throw_if_error(rocsparse_create_csr_descr( + &this->mat_d_, __backend::shape(d_base)[0], __backend::shape(d_base)[1], + d_base.values().size(), d_base.rowptr().data(), d_base.colind().data(), + d_base.values().data(), + to_rocsparse_indextype(), + to_rocsparse_indextype(), + rocsparse_index_base_zero, + to_rocsparse_datatype())); + //-------------------------------------------------------------------------- + // SpGEMM Computation + // ask buffer_size bytes for external memory + __rocsparse::throw_if_error(rocsparse_spgemm( + handle, rocsparse_operation_none, rocsparse_operation_none, &alpha, + this->mat_a_, this->mat_b_, &beta, this->mat_d_, this->mat_c_, + to_rocsparse_datatype(), rocsparse_spgemm_alg_default, + rocsparse_spgemm_stage_buffer_size, &buffer_size, nullptr)); + if (buffer_size > this->buffer_size_) { + this->alloc_.deallocate(workspace_, this->buffer_size_); + this->buffer_size_ = buffer_size; + workspace_ = this->alloc_.allocate(buffer_size); + } + __rocsparse::throw_if_error(rocsparse_spgemm( + handle, rocsparse_operation_none, rocsparse_operation_none, &alpha, + this->mat_a_, this->mat_b_, &beta, this->mat_d_, this->mat_c_, + to_rocsparse_datatype(), rocsparse_spgemm_alg_default, + rocsparse_spgemm_stage_nnz, &this->buffer_size_, this->workspace_)); + // get matrix C non-zero entries and size + int64_t c_num_rows; + int64_t c_num_cols; + __rocsparse::throw_if_error(rocsparse_spmat_get_size( + this->mat_c_, &c_num_rows, &c_num_cols, &this->result_nnz_)); + this->result_shape_ = index(c_num_rows, c_num_cols); + } + + template + requires __detail::has_csr_base && __detail::has_csr_base && + __detail::is_csr_view_v && __detail::has_csr_base + void multiply_fill(A&& a, B&& b, C&& c, D&& d) { + auto a_base = __detail::get_ultimate_base(a); + auto b_base = __detail::get_ultimate_base(b); + using matrix_type = decltype(a_base); + using input_type = decltype(b_base); + using output_type = std::remove_reference_t; + using value_type = typename matrix_type::scalar_type; + + auto alpha_optional = __detail::get_scaling_factor(a, b); + tensor_scalar_t alpha = alpha_optional.value_or(1); + value_type alpha_val = alpha; + auto beta_optional = __detail::get_scaling_factor(d); + value_type beta = beta_optional.value_or(1); + + __rocsparse::throw_if_error(rocsparse_csr_set_pointers( + this->mat_c_, c.rowptr().data(), c.colind().data(), c.values().data())); + + __rocsparse::throw_if_error(rocsparse_spgemm( + handle_.get(), rocsparse_operation_none, rocsparse_operation_none, + &alpha, this->mat_a_, this->mat_b_, &beta, this->mat_d_, this->mat_c_, + to_rocsparse_datatype(), rocsparse_spgemm_alg_default, + rocsparse_spgemm_stage_compute, &this->buffer_size_, workspace_)); + } + + template + requires __detail::has_csr_base && __detail::has_csr_base && + __detail::is_csr_view_v && __detail::has_csr_base + void multiply_symbolic_fill(A&& a, B&& b, C&& c, D&& d) { + auto a_base = __detail::get_ultimate_base(a); + auto b_base = __detail::get_ultimate_base(b); + auto d_base = __detail::get_ultimate_base(d); + using matrix_type = decltype(a_base); + using input_type = decltype(b_base); + using output_type = std::remove_reference_t; + using value_type = typename matrix_type::scalar_type; + + auto alpha_optional = __detail::get_scaling_factor(a, b); + value_type alpha = alpha_optional.value_or(1); + auto beta_optional = __detail::get_scaling_factor(d); + value_type beta = beta_optional.value_or(1); + + __rocsparse::throw_if_error(rocsparse_csr_set_pointers( + this->mat_c_, c.rowptr().data(), c.colind().data(), c.values().data())); + + __rocsparse::throw_if_error(rocsparse_spgemm( + this->handle_.get(), rocsparse_operation_none, rocsparse_operation_none, + &alpha, this->mat_a_, this->mat_b_, &beta, this->mat_d_, this->mat_c_, + to_rocsparse_datatype(), rocsparse_spgemm_alg_default, + rocsparse_spgemm_stage_symbolic, &this->buffer_size_, + this->workspace_)); + } + + template + requires __detail::has_csr_base && __detail::has_csr_base && + __detail::is_csr_view_v && __detail::has_csr_base + void multiply_numeric(A&& a, B&& b, C&& c, D&& d) { + auto a_base = __detail::get_ultimate_base(a); + auto b_base = __detail::get_ultimate_base(b); + using matrix_type = decltype(a_base); + using input_type = decltype(b_base); + using output_type = std::remove_reference_t; + using value_type = typename matrix_type::scalar_type; + + auto alpha_optional = __detail::get_scaling_factor(a, b); + tensor_scalar_t alpha = alpha_optional.value_or(1); + value_type alpha_val = alpha; + auto beta_optional = __detail::get_scaling_factor(d); + value_type beta = beta_optional.value_or(1); + + __rocsparse::throw_if_error(rocsparse_spgemm( + this->handle_.get(), rocsparse_operation_none, rocsparse_operation_none, + &alpha, this->mat_a_, this->mat_b_, &beta, this->mat_d_, this->mat_c_, + to_rocsparse_datatype(), rocsparse_spgemm_alg_default, + rocsparse_spgemm_stage_numeric, &this->buffer_size_, this->workspace_)); + } + +private: + using handle_manager = + std::unique_ptr::element_type, + std::function>; + handle_manager handle_; + rocsparse::hip_allocator alloc_; + long unsigned int buffer_size_; + char* workspace_; + index result_shape_; + index_t result_nnz_; + rocsparse_spmat_descr mat_a_; + rocsparse_spmat_descr mat_b_; + rocsparse_spmat_descr mat_c_; + rocsparse_spmat_descr mat_d_; +}; + +template + requires __detail::has_csr_base && __detail::has_csr_base && + __detail::is_csr_view_v +void multiply_inspect(spgemm_state_t& spgemm_handle, A&& a, B&& b, C&& c) {} + +template + requires __detail::has_csr_base && __detail::has_csr_base && + __detail::is_csr_view_v && __detail::has_csr_base +void multiply_compute(spgemm_state_t& spgemm_handle, A&& a, B&& b, C&& c, + D&& d) { + spgemm_handle.multiply_compute(a, b, c, d); +} + +template + requires __detail::has_csr_base && __detail::has_csr_base && + __detail::is_csr_view_v && __detail::has_csr_base +void multiply_fill(spgemm_state_t& spgemm_handle, A&& a, B&& b, C&& c, D&& d) { + spgemm_handle.multiply_fill(a, b, c, d); +} + +template + requires __detail::has_csr_base && __detail::has_csr_base && + __detail::is_csr_view_v && __detail::has_csr_base +void multiply_symbolic_compute(spgemm_state_t& spgemm_handle, A&& a, B&& b, + C&& c, D&& d) { + spgemm_handle.multiply_compute(a, b, c, d); +} + +template + requires __detail::has_csr_base && __detail::has_csr_base && + __detail::is_csr_view_v && __detail::has_csr_base +void multiply_symbolic_fill(spgemm_state_t& spgemm_handle, A&& a, B&& b, C&& c, + D&& d) { + spgemm_handle.multiply_symbolic_fill(a, b, c, d); +} + +template + requires __detail::has_csr_base && __detail::has_csr_base && + __detail::is_csr_view_v && __detail::has_csr_base +void multiply_numeric(spgemm_state_t& spgemm_handle, A&& a, B&& b, C&& c, + D&& d) { + spgemm_handle.multiply_numeric(a, b, c, d); +} + +// the followings support C = A*B by giving null D matrix. +template + requires __detail::has_csr_base && __detail::has_csr_base && + __detail::is_csr_view_v +void multiply_compute(spgemm_state_t& spgemm_handle, A&& a, B&& b, C&& c) { + auto d = __rocsparse::create_null_matrix>(); + spgemm_handle.multiply_compute(a, b, c, scaled(0.0, d)); +} + +template + requires __detail::has_csr_base && __detail::has_csr_base && + __detail::is_csr_view_v +void multiply_fill(spgemm_state_t& spgemm_handle, A&& a, B&& b, C&& c) { + auto d = __rocsparse::create_null_matrix>(); + spgemm_handle.multiply_fill(a, b, c, scaled(0.0, d)); +} + +template + requires __detail::has_csr_base && __detail::has_csr_base && + __detail::is_csr_view_v +void multiply_symbolic_compute(spgemm_state_t& spgemm_handle, A&& a, B&& b, + C&& c) { + auto d = __rocsparse::create_null_matrix>(); + spgemm_handle.multiply_compute(a, b, c, scaled(0.0, d)); +} + +template + requires __detail::has_csr_base && __detail::has_csr_base && + __detail::is_csr_view_v +void multiply_symbolic_fill(spgemm_state_t& spgemm_handle, A&& a, B&& b, + C&& c) { + auto d = __rocsparse::create_null_matrix>(); + spgemm_handle.multiply_symbolic_fill(a, b, c, scaled(0.0, d)); +} + +template + requires __detail::has_csr_base && __detail::has_csr_base && + __detail::is_csr_view_v +void multiply_numeric(spgemm_state_t& spgemm_handle, A&& a, B&& b, C&& c) { + auto d = __rocsparse::create_null_matrix>(); + spgemm_handle.multiply_numeric(a, b, c, scaled(0.0, d)); +} + +} // namespace spblas diff --git a/include/spblas/vendor/rocsparse/rocsparse.hpp b/include/spblas/vendor/rocsparse/rocsparse.hpp index 7caa698..014b2ba 100644 --- a/include/spblas/vendor/rocsparse/rocsparse.hpp +++ b/include/spblas/vendor/rocsparse/rocsparse.hpp @@ -1,3 +1,4 @@ #pragma once #include "multiply.hpp" +#include "multiply_spgemm.hpp" From 9263ec1978212f5437695f53f8787a76ecf18d70 Mon Sep 17 00:00:00 2001 From: "Yu-Hsiang M. Tsai" Date: Thu, 27 Mar 2025 18:26:42 +0100 Subject: [PATCH 02/14] add rocsparse spgemm with 3 args test --- include/spblas/vendor/rocsparse/exception.hpp | 4 +- test/gtest/CMakeLists.txt | 6 +- test/gtest/device/spgemm_test.cpp | 273 ++++++++++++++++++ 3 files changed, 280 insertions(+), 3 deletions(-) create mode 100644 test/gtest/device/spgemm_test.cpp diff --git a/include/spblas/vendor/rocsparse/exception.hpp b/include/spblas/vendor/rocsparse/exception.hpp index 6a2ed5c..cb836b5 100644 --- a/include/spblas/vendor/rocsparse/exception.hpp +++ b/include/spblas/vendor/rocsparse/exception.hpp @@ -10,7 +10,7 @@ namespace spblas { namespace __rocsparse { // Throw an exception if the hipError_t is not hipSuccess. -void throw_if_error(hipError_t error_code, std::string prefix = "") { +inline void throw_if_error(hipError_t error_code, std::string prefix = "") { if (error_code == hipSuccess) { return; } @@ -21,7 +21,7 @@ void throw_if_error(hipError_t error_code, std::string prefix = "") { } // Throw an exception if the rocsparse_status is not rocsparse_status_success. -void throw_if_error(rocsparse_status error_code) { +inline void throw_if_error(rocsparse_status error_code) { if (error_code == rocsparse_status_success) { return; } else if (error_code == rocsparse_status_invalid_handle) { diff --git a/test/gtest/CMakeLists.txt b/test/gtest/CMakeLists.txt index 97c54d2..2ac50d3 100644 --- a/test/gtest/CMakeLists.txt +++ b/test/gtest/CMakeLists.txt @@ -22,7 +22,11 @@ if (NOT SPBLAS_GPU_BACKEND) triangular_solve_test.cpp ) else() - set(TEST_SOURCES device/spmv_test.cpp) + if(ENABLE_ROCSPARSE) + set(TEST_SOURCES device/spmv_test.cpp device/spgemm_test.cpp) + else() + set(TEST_SOURCES device/spmv_test.cpp) + endif() add_device_test(TEST_SOURCES) endif() diff --git a/test/gtest/device/spgemm_test.cpp b/test/gtest/device/spgemm_test.cpp new file mode 100644 index 0000000..4662ffd --- /dev/null +++ b/test/gtest/device/spgemm_test.cpp @@ -0,0 +1,273 @@ + +#include "../util.hpp" +#include + +#include +#include + +using value_t = float; +using index_t = spblas::index_t; +using offset_t = spblas::offset_t; + +TEST(CsrView, SpGEMM) { + for (auto&& [m, k, nnz] : util::dims) { + for (auto&& n : {m, k}) { + auto [a_values, a_rowptr, a_colind, a_shape, a_nnz] = + spblas::generate_csr(m, k, nnz); + thrust::device_vector d_a_values(a_values); + thrust::device_vector d_a_rowptr(a_rowptr); + thrust::device_vector d_a_colind(a_colind); + spblas::csr_view d_a( + d_a_values.data().get(), d_a_rowptr.data().get(), + d_a_colind.data().get(), a_shape, a_nnz); + spblas::csr_view a(a_values, a_rowptr, + a_colind, a_shape, a_nnz); + + auto [b_values, b_rowptr, b_colind, b_shape, b_nnz] = + spblas::generate_csr(k, n, nnz); + thrust::device_vector d_b_values(b_values); + thrust::device_vector d_b_rowptr(b_rowptr); + thrust::device_vector d_b_colind(b_colind); + spblas::csr_view d_b( + d_b_values.data().get(), d_b_rowptr.data().get(), + d_b_colind.data().get(), b_shape, b_nnz); + spblas::csr_view b(b_values, b_rowptr, + b_colind, b_shape, b_nnz); + + thrust::device_vector d_c_rowptr(m + 1); + + spblas::csr_view d_c( + nullptr, d_c_rowptr.data().get(), nullptr, {m, n}, 0); + + spblas::spgemm_state_t state; + spblas::multiply_compute(state, d_a, d_b, d_c); + auto nnz = state.result_nnz(); + thrust::device_vector d_c_values(nnz); + thrust::device_vector d_c_colind(nnz); + std::span d_c_values_span(d_c_values.data().get(), nnz); + std::span d_c_rowptr_span(d_c_rowptr.data().get(), m + 1); + std::span d_c_colind_span(d_c_colind.data().get(), nnz); + d_c.update(d_c_values_span, d_c_rowptr_span, d_c_colind_span, {m, n}, + nnz); + + spblas::multiply_fill(state, d_a, d_b, d_c); + + std::vector c_values(nnz); + std::vector c_rowptr(m + 1); + std::vector c_colind(nnz); + thrust::copy(d_c_values.begin(), d_c_values.end(), c_values.begin()); + thrust::copy(d_c_rowptr.begin(), d_c_rowptr.end(), c_rowptr.begin()); + thrust::copy(d_c_colind.begin(), d_c_colind.end(), c_colind.begin()); + spblas::csr_view c(c_values, c_rowptr, + c_colind, {m, n}, nnz); + + spblas::__backend::spa_accumulator c_row_ref( + spblas::__backend::shape(c)[1]); + + spblas::__backend::spa_accumulator c_row_acc( + spblas::__backend::shape(c)[1]); + + for (auto&& [i, a_row] : spblas::__backend::rows(a)) { + c_row_ref.clear(); + for (auto&& [k, a_v] : a_row) { + auto&& b_row = spblas::__backend::lookup_row(b, k); + + for (auto&& [j, b_v] : b_row) { + c_row_ref[j] += a_v * b_v; + } + } + + auto&& c_row = spblas::__backend::lookup_row(c, i); + + // Accumulate output into `c_row_acc` so that we can allow + // duplicate column indices. + c_row_acc.clear(); + for (auto&& [j, c_v] : c_row) { + c_row_acc[j] += c_v; + } + + for (auto&& [j, c_v] : c_row) { + EXPECT_EQ_(c_row_ref[j], c_row_acc[j]); + } + + EXPECT_EQ(c_row_ref.size(), c_row_acc.size()); + } + } + } +} + +TEST(CsrView, SpGEMM_AScaled) { + value_t alpha = 2.0f; + for (auto&& [m, k, nnz] : util::dims) { + for (auto&& n : {m, k}) { + auto [a_values, a_rowptr, a_colind, a_shape, a_nnz] = + spblas::generate_csr(m, k, nnz); + thrust::device_vector d_a_values(a_values); + thrust::device_vector d_a_rowptr(a_rowptr); + thrust::device_vector d_a_colind(a_colind); + spblas::csr_view d_a( + d_a_values.data().get(), d_a_rowptr.data().get(), + d_a_colind.data().get(), a_shape, a_nnz); + spblas::csr_view a(a_values, a_rowptr, + a_colind, a_shape, a_nnz); + + auto [b_values, b_rowptr, b_colind, b_shape, b_nnz] = + spblas::generate_csr(k, n, nnz); + thrust::device_vector d_b_values(b_values); + thrust::device_vector d_b_rowptr(b_rowptr); + thrust::device_vector d_b_colind(b_colind); + spblas::csr_view d_b( + d_b_values.data().get(), d_b_rowptr.data().get(), + d_b_colind.data().get(), b_shape, b_nnz); + spblas::csr_view b(b_values, b_rowptr, + b_colind, b_shape, b_nnz); + + thrust::device_vector d_c_rowptr(m + 1); + + spblas::csr_view d_c( + nullptr, d_c_rowptr.data().get(), nullptr, {m, n}, 0); + + spblas::spgemm_state_t state; + spblas::multiply_compute(state, spblas::scaled(alpha, d_a), d_b, d_c); + auto nnz = state.result_nnz(); + thrust::device_vector d_c_values(nnz); + thrust::device_vector d_c_colind(nnz); + std::span d_c_values_span(d_c_values.data().get(), nnz); + std::span d_c_rowptr_span(d_c_rowptr.data().get(), m + 1); + std::span d_c_colind_span(d_c_colind.data().get(), nnz); + d_c.update(d_c_values_span, d_c_rowptr_span, d_c_colind_span, {m, n}, + nnz); + + spblas::multiply_fill(state, spblas::scaled(alpha, d_a), d_b, d_c); + + std::vector c_values(nnz); + std::vector c_rowptr(m + 1); + std::vector c_colind(nnz); + thrust::copy(d_c_values.begin(), d_c_values.end(), c_values.begin()); + thrust::copy(d_c_rowptr.begin(), d_c_rowptr.end(), c_rowptr.begin()); + thrust::copy(d_c_colind.begin(), d_c_colind.end(), c_colind.begin()); + spblas::csr_view c(c_values, c_rowptr, + c_colind, {m, n}, nnz); + + spblas::__backend::spa_accumulator c_row_ref( + spblas::__backend::shape(c)[1]); + + spblas::__backend::spa_accumulator c_row_acc( + spblas::__backend::shape(c)[1]); + + for (auto&& [i, a_row] : spblas::__backend::rows(a)) { + c_row_ref.clear(); + for (auto&& [k, a_v] : a_row) { + auto&& b_row = spblas::__backend::lookup_row(b, k); + + for (auto&& [j, b_v] : b_row) { + c_row_ref[j] += alpha * a_v * b_v; + } + } + + auto&& c_row = spblas::__backend::lookup_row(c, i); + + // Accumulate output into `c_row_acc` so that we can allow + // duplicate column indices. + c_row_acc.clear(); + for (auto&& [j, c_v] : c_row) { + c_row_acc[j] += c_v; + } + + for (auto&& [j, c_v] : c_row) { + EXPECT_EQ_(c_row_ref[j], c_row_acc[j]); + } + + EXPECT_EQ(c_row_ref.size(), c_row_acc.size()); + } + } + } +} + +TEST(CsrView, SpGEMM_BScaled) { + value_t alpha = 2.0f; + for (auto&& [m, k, nnz] : util::dims) { + for (auto&& n : {m, k}) { + auto [a_values, a_rowptr, a_colind, a_shape, a_nnz] = + spblas::generate_csr(m, k, nnz); + thrust::device_vector d_a_values(a_values); + thrust::device_vector d_a_rowptr(a_rowptr); + thrust::device_vector d_a_colind(a_colind); + spblas::csr_view d_a( + d_a_values.data().get(), d_a_rowptr.data().get(), + d_a_colind.data().get(), a_shape, a_nnz); + spblas::csr_view a(a_values, a_rowptr, + a_colind, a_shape, a_nnz); + + auto [b_values, b_rowptr, b_colind, b_shape, b_nnz] = + spblas::generate_csr(k, n, nnz); + thrust::device_vector d_b_values(b_values); + thrust::device_vector d_b_rowptr(b_rowptr); + thrust::device_vector d_b_colind(b_colind); + spblas::csr_view d_b( + d_b_values.data().get(), d_b_rowptr.data().get(), + d_b_colind.data().get(), b_shape, b_nnz); + spblas::csr_view b(b_values, b_rowptr, + b_colind, b_shape, b_nnz); + + thrust::device_vector d_c_rowptr(m + 1); + + spblas::csr_view d_c( + nullptr, d_c_rowptr.data().get(), nullptr, {m, n}, 0); + + spblas::spgemm_state_t state; + spblas::multiply_compute(state, d_a, spblas::scaled(alpha, d_b), d_c); + auto nnz = state.result_nnz(); + thrust::device_vector d_c_values(nnz); + thrust::device_vector d_c_colind(nnz); + std::span d_c_values_span(d_c_values.data().get(), nnz); + std::span d_c_rowptr_span(d_c_rowptr.data().get(), m + 1); + std::span d_c_colind_span(d_c_colind.data().get(), nnz); + d_c.update(d_c_values_span, d_c_rowptr_span, d_c_colind_span, {m, n}, + nnz); + + spblas::multiply_fill(state, d_a, spblas::scaled(alpha, d_b), d_c); + + std::vector c_values(nnz); + std::vector c_rowptr(m + 1); + std::vector c_colind(nnz); + thrust::copy(d_c_values.begin(), d_c_values.end(), c_values.begin()); + thrust::copy(d_c_rowptr.begin(), d_c_rowptr.end(), c_rowptr.begin()); + thrust::copy(d_c_colind.begin(), d_c_colind.end(), c_colind.begin()); + spblas::csr_view c(c_values, c_rowptr, + c_colind, {m, n}, nnz); + + spblas::__backend::spa_accumulator c_row_ref( + spblas::__backend::shape(c)[1]); + + spblas::__backend::spa_accumulator c_row_acc( + spblas::__backend::shape(c)[1]); + + for (auto&& [i, a_row] : spblas::__backend::rows(a)) { + c_row_ref.clear(); + for (auto&& [k, a_v] : a_row) { + auto&& b_row = spblas::__backend::lookup_row(b, k); + + for (auto&& [j, b_v] : b_row) { + c_row_ref[j] += alpha * a_v * b_v; + } + } + + auto&& c_row = spblas::__backend::lookup_row(c, i); + + // Accumulate output into `c_row_acc` so that we can allow + // duplicate column indices. + c_row_acc.clear(); + for (auto&& [j, c_v] : c_row) { + c_row_acc[j] += c_v; + } + + for (auto&& [j, c_v] : c_row) { + EXPECT_EQ_(c_row_ref[j], c_row_acc[j]); + } + + EXPECT_EQ(c_row_ref.size(), c_row_acc.size()); + } + } + } +} From 59cfabe2dad7a806f68c552bbac2773e96e41fac Mon Sep 17 00:00:00 2001 From: "Yu-Hsiang M. Tsai" Date: Thu, 27 Mar 2025 18:38:26 +0100 Subject: [PATCH 03/14] add rocsparse spgemm with 4 args test --- test/gtest/CMakeLists.txt | 2 +- .../device/rocsparse/spgemm_4args_test.cpp | 421 ++++++++++++++++++ 2 files changed, 422 insertions(+), 1 deletion(-) create mode 100644 test/gtest/device/rocsparse/spgemm_4args_test.cpp diff --git a/test/gtest/CMakeLists.txt b/test/gtest/CMakeLists.txt index 2ac50d3..c3b554b 100644 --- a/test/gtest/CMakeLists.txt +++ b/test/gtest/CMakeLists.txt @@ -23,7 +23,7 @@ if (NOT SPBLAS_GPU_BACKEND) ) else() if(ENABLE_ROCSPARSE) - set(TEST_SOURCES device/spmv_test.cpp device/spgemm_test.cpp) + set(TEST_SOURCES device/spmv_test.cpp device/spgemm_test.cpp device/rocsparse/spgemm_4args_test.cpp) else() set(TEST_SOURCES device/spmv_test.cpp) endif() diff --git a/test/gtest/device/rocsparse/spgemm_4args_test.cpp b/test/gtest/device/rocsparse/spgemm_4args_test.cpp new file mode 100644 index 0000000..f1b272b --- /dev/null +++ b/test/gtest/device/rocsparse/spgemm_4args_test.cpp @@ -0,0 +1,421 @@ + +#include "../../util.hpp" +#include + +#include +#include + +using value_t = float; +using index_t = spblas::index_t; +using offset_t = spblas::offset_t; + +TEST(CsrView, SpGEMM_4Args) { + for (auto&& [m, k, nnz] : util::dims) { + for (auto&& n : {m, k}) { + auto [a_values, a_rowptr, a_colind, a_shape, a_nnz] = + spblas::generate_csr(m, k, nnz); + thrust::device_vector d_a_values(a_values); + thrust::device_vector d_a_rowptr(a_rowptr); + thrust::device_vector d_a_colind(a_colind); + spblas::csr_view d_a( + d_a_values.data().get(), d_a_rowptr.data().get(), + d_a_colind.data().get(), a_shape, a_nnz); + spblas::csr_view a(a_values, a_rowptr, + a_colind, a_shape, a_nnz); + + auto [b_values, b_rowptr, b_colind, b_shape, b_nnz] = + spblas::generate_csr(k, n, nnz); + thrust::device_vector d_b_values(b_values); + thrust::device_vector d_b_rowptr(b_rowptr); + thrust::device_vector d_b_colind(b_colind); + spblas::csr_view d_b( + d_b_values.data().get(), d_b_rowptr.data().get(), + d_b_colind.data().get(), b_shape, b_nnz); + spblas::csr_view b(b_values, b_rowptr, + b_colind, b_shape, b_nnz); + + auto [d_values, d_rowptr, d_colind, d_shape, d_nnz] = + spblas::generate_csr(m, n, nnz); + thrust::device_vector d_d_values(d_values); + thrust::device_vector d_d_rowptr(d_rowptr); + thrust::device_vector d_d_colind(d_colind); + spblas::csr_view d_d( + d_d_values.data().get(), d_d_rowptr.data().get(), + d_d_colind.data().get(), d_shape, d_nnz); + spblas::csr_view d(d_values, d_rowptr, + d_colind, d_shape, d_nnz); + + thrust::device_vector d_c_rowptr(m + 1); + + spblas::csr_view d_c( + nullptr, d_c_rowptr.data().get(), nullptr, {m, n}, 0); + + spblas::spgemm_state_t state; + spblas::multiply_compute(state, d_a, d_b, d_c, d_d); + auto nnz = state.result_nnz(); + thrust::device_vector d_c_values(nnz); + thrust::device_vector d_c_colind(nnz); + std::span d_c_values_span(d_c_values.data().get(), nnz); + std::span d_c_rowptr_span(d_c_rowptr.data().get(), m + 1); + std::span d_c_colind_span(d_c_colind.data().get(), nnz); + d_c.update(d_c_values_span, d_c_rowptr_span, d_c_colind_span, {m, n}, + nnz); + + spblas::multiply_fill(state, d_a, d_b, d_c, d_d); + + std::vector c_values(nnz); + std::vector c_rowptr(m + 1); + std::vector c_colind(nnz); + thrust::copy(d_c_values.begin(), d_c_values.end(), c_values.begin()); + thrust::copy(d_c_rowptr.begin(), d_c_rowptr.end(), c_rowptr.begin()); + thrust::copy(d_c_colind.begin(), d_c_colind.end(), c_colind.begin()); + spblas::csr_view c(c_values, c_rowptr, + c_colind, {m, n}, nnz); + + spblas::__backend::spa_accumulator c_row_ref( + spblas::__backend::shape(c)[1]); + + spblas::__backend::spa_accumulator c_row_acc( + spblas::__backend::shape(c)[1]); + + for (auto&& [i, a_row] : spblas::__backend::rows(a)) { + c_row_ref.clear(); + for (auto&& [k, a_v] : a_row) { + auto&& b_row = spblas::__backend::lookup_row(b, k); + + for (auto&& [j, b_v] : b_row) { + c_row_ref[j] += a_v * b_v; + } + } + auto&& d_row = spblas::__backend::lookup_row(d, i); + for (auto&& [k, d_v] : d_row) { + c_row_ref[k] += d_v; + } + + auto&& c_row = spblas::__backend::lookup_row(c, i); + + // Accumulate output into `c_row_acc` so that we can allow + // duplicate column indices. + c_row_acc.clear(); + for (auto&& [j, c_v] : c_row) { + c_row_acc[j] += c_v; + } + + for (auto&& [j, c_v] : c_row) { + EXPECT_EQ_(c_row_ref[j], c_row_acc[j]); + } + + EXPECT_EQ(c_row_ref.size(), c_row_acc.size()); + } + } + } +} + +TEST(CsrView, SpGEMM_4Args_AScaled) { + value_t alpha = 2.0f; + for (auto&& [m, k, nnz] : util::dims) { + for (auto&& n : {m, k}) { + auto [a_values, a_rowptr, a_colind, a_shape, a_nnz] = + spblas::generate_csr(m, k, nnz); + thrust::device_vector d_a_values(a_values); + thrust::device_vector d_a_rowptr(a_rowptr); + thrust::device_vector d_a_colind(a_colind); + spblas::csr_view d_a( + d_a_values.data().get(), d_a_rowptr.data().get(), + d_a_colind.data().get(), a_shape, a_nnz); + spblas::csr_view a(a_values, a_rowptr, + a_colind, a_shape, a_nnz); + + auto [b_values, b_rowptr, b_colind, b_shape, b_nnz] = + spblas::generate_csr(k, n, nnz); + thrust::device_vector d_b_values(b_values); + thrust::device_vector d_b_rowptr(b_rowptr); + thrust::device_vector d_b_colind(b_colind); + spblas::csr_view d_b( + d_b_values.data().get(), d_b_rowptr.data().get(), + d_b_colind.data().get(), b_shape, b_nnz); + spblas::csr_view b(b_values, b_rowptr, + b_colind, b_shape, b_nnz); + + auto [d_values, d_rowptr, d_colind, d_shape, d_nnz] = + spblas::generate_csr(m, n, nnz); + thrust::device_vector d_d_values(d_values); + thrust::device_vector d_d_rowptr(d_rowptr); + thrust::device_vector d_d_colind(d_colind); + spblas::csr_view d_d( + d_d_values.data().get(), d_d_rowptr.data().get(), + d_d_colind.data().get(), d_shape, d_nnz); + spblas::csr_view d(d_values, d_rowptr, + d_colind, d_shape, d_nnz); + + thrust::device_vector d_c_rowptr(m + 1); + + spblas::csr_view d_c( + nullptr, d_c_rowptr.data().get(), nullptr, {m, n}, 0); + + spblas::spgemm_state_t state; + spblas::multiply_compute(state, scaled(alpha, d_a), d_b, d_c, d_d); + auto nnz = state.result_nnz(); + thrust::device_vector d_c_values(nnz); + thrust::device_vector d_c_colind(nnz); + std::span d_c_values_span(d_c_values.data().get(), nnz); + std::span d_c_rowptr_span(d_c_rowptr.data().get(), m + 1); + std::span d_c_colind_span(d_c_colind.data().get(), nnz); + d_c.update(d_c_values_span, d_c_rowptr_span, d_c_colind_span, {m, n}, + nnz); + + spblas::multiply_fill(state, scaled(alpha, d_a), d_b, d_c, d_d); + + std::vector c_values(nnz); + std::vector c_rowptr(m + 1); + std::vector c_colind(nnz); + thrust::copy(d_c_values.begin(), d_c_values.end(), c_values.begin()); + thrust::copy(d_c_rowptr.begin(), d_c_rowptr.end(), c_rowptr.begin()); + thrust::copy(d_c_colind.begin(), d_c_colind.end(), c_colind.begin()); + spblas::csr_view c(c_values, c_rowptr, + c_colind, {m, n}, nnz); + + spblas::__backend::spa_accumulator c_row_ref( + spblas::__backend::shape(c)[1]); + + spblas::__backend::spa_accumulator c_row_acc( + spblas::__backend::shape(c)[1]); + + for (auto&& [i, a_row] : spblas::__backend::rows(a)) { + c_row_ref.clear(); + for (auto&& [k, a_v] : a_row) { + auto&& b_row = spblas::__backend::lookup_row(b, k); + + for (auto&& [j, b_v] : b_row) { + c_row_ref[j] += alpha * a_v * b_v; + } + } + auto&& d_row = spblas::__backend::lookup_row(d, i); + for (auto&& [k, d_v] : d_row) { + c_row_ref[k] += d_v; + } + + auto&& c_row = spblas::__backend::lookup_row(c, i); + + // Accumulate output into `c_row_acc` so that we can allow + // duplicate column indices. + c_row_acc.clear(); + for (auto&& [j, c_v] : c_row) { + c_row_acc[j] += c_v; + } + + for (auto&& [j, c_v] : c_row) { + EXPECT_EQ_(c_row_ref[j], c_row_acc[j]); + } + + EXPECT_EQ(c_row_ref.size(), c_row_acc.size()); + } + } + } +} + +TEST(CsrView, SpGEMM_4Args_BScaled) { + value_t alpha = 2.0f; + for (auto&& [m, k, nnz] : util::dims) { + for (auto&& n : {m, k}) { + auto [a_values, a_rowptr, a_colind, a_shape, a_nnz] = + spblas::generate_csr(m, k, nnz); + thrust::device_vector d_a_values(a_values); + thrust::device_vector d_a_rowptr(a_rowptr); + thrust::device_vector d_a_colind(a_colind); + spblas::csr_view d_a( + d_a_values.data().get(), d_a_rowptr.data().get(), + d_a_colind.data().get(), a_shape, a_nnz); + spblas::csr_view a(a_values, a_rowptr, + a_colind, a_shape, a_nnz); + + auto [b_values, b_rowptr, b_colind, b_shape, b_nnz] = + spblas::generate_csr(k, n, nnz); + thrust::device_vector d_b_values(b_values); + thrust::device_vector d_b_rowptr(b_rowptr); + thrust::device_vector d_b_colind(b_colind); + spblas::csr_view d_b( + d_b_values.data().get(), d_b_rowptr.data().get(), + d_b_colind.data().get(), b_shape, b_nnz); + spblas::csr_view b(b_values, b_rowptr, + b_colind, b_shape, b_nnz); + + auto [d_values, d_rowptr, d_colind, d_shape, d_nnz] = + spblas::generate_csr(m, n, nnz); + thrust::device_vector d_d_values(d_values); + thrust::device_vector d_d_rowptr(d_rowptr); + thrust::device_vector d_d_colind(d_colind); + spblas::csr_view d_d( + d_d_values.data().get(), d_d_rowptr.data().get(), + d_d_colind.data().get(), d_shape, d_nnz); + spblas::csr_view d(d_values, d_rowptr, + d_colind, d_shape, d_nnz); + + thrust::device_vector d_c_rowptr(m + 1); + + spblas::csr_view d_c( + nullptr, d_c_rowptr.data().get(), nullptr, {m, n}, 0); + + spblas::spgemm_state_t state; + spblas::multiply_compute(state, d_a, scaled(alpha, d_b), d_c, d_d); + auto nnz = state.result_nnz(); + thrust::device_vector d_c_values(nnz); + thrust::device_vector d_c_colind(nnz); + std::span d_c_values_span(d_c_values.data().get(), nnz); + std::span d_c_rowptr_span(d_c_rowptr.data().get(), m + 1); + std::span d_c_colind_span(d_c_colind.data().get(), nnz); + d_c.update(d_c_values_span, d_c_rowptr_span, d_c_colind_span, {m, n}, + nnz); + + spblas::multiply_fill(state, d_a, scaled(alpha, d_b), d_c, d_d); + + std::vector c_values(nnz); + std::vector c_rowptr(m + 1); + std::vector c_colind(nnz); + thrust::copy(d_c_values.begin(), d_c_values.end(), c_values.begin()); + thrust::copy(d_c_rowptr.begin(), d_c_rowptr.end(), c_rowptr.begin()); + thrust::copy(d_c_colind.begin(), d_c_colind.end(), c_colind.begin()); + spblas::csr_view c(c_values, c_rowptr, + c_colind, {m, n}, nnz); + + spblas::__backend::spa_accumulator c_row_ref( + spblas::__backend::shape(c)[1]); + + spblas::__backend::spa_accumulator c_row_acc( + spblas::__backend::shape(c)[1]); + + for (auto&& [i, a_row] : spblas::__backend::rows(a)) { + c_row_ref.clear(); + for (auto&& [k, a_v] : a_row) { + auto&& b_row = spblas::__backend::lookup_row(b, k); + + for (auto&& [j, b_v] : b_row) { + c_row_ref[j] += alpha * a_v * b_v; + } + } + auto&& d_row = spblas::__backend::lookup_row(d, i); + for (auto&& [k, d_v] : d_row) { + c_row_ref[k] += d_v; + } + + auto&& c_row = spblas::__backend::lookup_row(c, i); + + // Accumulate output into `c_row_acc` so that we can allow + // duplicate column indices. + c_row_acc.clear(); + for (auto&& [j, c_v] : c_row) { + c_row_acc[j] += c_v; + } + + for (auto&& [j, c_v] : c_row) { + EXPECT_EQ_(c_row_ref[j], c_row_acc[j]); + } + + EXPECT_EQ(c_row_ref.size(), c_row_acc.size()); + } + } + } +} + +TEST(CsrView, SpGEMM_4Args_DScaled) { + value_t alpha = 2.0f; + for (auto&& [m, k, nnz] : util::dims) { + for (auto&& n : {m, k}) { + auto [a_values, a_rowptr, a_colind, a_shape, a_nnz] = + spblas::generate_csr(m, k, nnz); + thrust::device_vector d_a_values(a_values); + thrust::device_vector d_a_rowptr(a_rowptr); + thrust::device_vector d_a_colind(a_colind); + spblas::csr_view d_a( + d_a_values.data().get(), d_a_rowptr.data().get(), + d_a_colind.data().get(), a_shape, a_nnz); + spblas::csr_view a(a_values, a_rowptr, + a_colind, a_shape, a_nnz); + + auto [b_values, b_rowptr, b_colind, b_shape, b_nnz] = + spblas::generate_csr(k, n, nnz); + thrust::device_vector d_b_values(b_values); + thrust::device_vector d_b_rowptr(b_rowptr); + thrust::device_vector d_b_colind(b_colind); + spblas::csr_view d_b( + d_b_values.data().get(), d_b_rowptr.data().get(), + d_b_colind.data().get(), b_shape, b_nnz); + spblas::csr_view b(b_values, b_rowptr, + b_colind, b_shape, b_nnz); + + auto [d_values, d_rowptr, d_colind, d_shape, d_nnz] = + spblas::generate_csr(m, n, nnz); + thrust::device_vector d_d_values(d_values); + thrust::device_vector d_d_rowptr(d_rowptr); + thrust::device_vector d_d_colind(d_colind); + spblas::csr_view d_d( + d_d_values.data().get(), d_d_rowptr.data().get(), + d_d_colind.data().get(), d_shape, d_nnz); + spblas::csr_view d(d_values, d_rowptr, + d_colind, d_shape, d_nnz); + + thrust::device_vector d_c_rowptr(m + 1); + + spblas::csr_view d_c( + nullptr, d_c_rowptr.data().get(), nullptr, {m, n}, 0); + + spblas::spgemm_state_t state; + spblas::multiply_compute(state, d_a, d_b, d_c, scaled(alpha, d_d)); + auto nnz = state.result_nnz(); + thrust::device_vector d_c_values(nnz); + thrust::device_vector d_c_colind(nnz); + std::span d_c_values_span(d_c_values.data().get(), nnz); + std::span d_c_rowptr_span(d_c_rowptr.data().get(), m + 1); + std::span d_c_colind_span(d_c_colind.data().get(), nnz); + d_c.update(d_c_values_span, d_c_rowptr_span, d_c_colind_span, {m, n}, + nnz); + + spblas::multiply_fill(state, d_a, d_b, d_c, scaled(alpha, d_d)); + + std::vector c_values(nnz); + std::vector c_rowptr(m + 1); + std::vector c_colind(nnz); + thrust::copy(d_c_values.begin(), d_c_values.end(), c_values.begin()); + thrust::copy(d_c_rowptr.begin(), d_c_rowptr.end(), c_rowptr.begin()); + thrust::copy(d_c_colind.begin(), d_c_colind.end(), c_colind.begin()); + spblas::csr_view c(c_values, c_rowptr, + c_colind, {m, n}, nnz); + + spblas::__backend::spa_accumulator c_row_ref( + spblas::__backend::shape(c)[1]); + + spblas::__backend::spa_accumulator c_row_acc( + spblas::__backend::shape(c)[1]); + + for (auto&& [i, a_row] : spblas::__backend::rows(a)) { + c_row_ref.clear(); + for (auto&& [k, a_v] : a_row) { + auto&& b_row = spblas::__backend::lookup_row(b, k); + + for (auto&& [j, b_v] : b_row) { + c_row_ref[j] += a_v * b_v; + } + } + auto&& d_row = spblas::__backend::lookup_row(d, i); + for (auto&& [k, d_v] : d_row) { + c_row_ref[k] += alpha * d_v; + } + + auto&& c_row = spblas::__backend::lookup_row(c, i); + + // Accumulate output into `c_row_acc` so that we can allow + // duplicate column indices. + c_row_acc.clear(); + for (auto&& [j, c_v] : c_row) { + c_row_acc[j] += c_v; + } + + for (auto&& [j, c_v] : c_row) { + EXPECT_EQ_(c_row_ref[j], c_row_acc[j]); + } + + EXPECT_EQ(c_row_ref.size(), c_row_acc.size()); + } + } + } +} From ed7d26f81c97719119f2e47f64672fe05f4b72b8 Mon Sep 17 00:00:00 2001 From: "Yu-Hsiang M. Tsai" Date: Thu, 27 Mar 2025 19:16:27 +0100 Subject: [PATCH 04/14] add rocsparse spgemm symbolic/numeric test --- test/gtest/CMakeLists.txt | 2 +- test/gtest/device/spgemm_reuse_test.cpp | 324 ++++++++++++++++++++++++ 2 files changed, 325 insertions(+), 1 deletion(-) create mode 100644 test/gtest/device/spgemm_reuse_test.cpp diff --git a/test/gtest/CMakeLists.txt b/test/gtest/CMakeLists.txt index c3b554b..18de3e6 100644 --- a/test/gtest/CMakeLists.txt +++ b/test/gtest/CMakeLists.txt @@ -23,7 +23,7 @@ if (NOT SPBLAS_GPU_BACKEND) ) else() if(ENABLE_ROCSPARSE) - set(TEST_SOURCES device/spmv_test.cpp device/spgemm_test.cpp device/rocsparse/spgemm_4args_test.cpp) + set(TEST_SOURCES device/spmv_test.cpp device/spgemm_test.cpp device/spgemm_reuse_test.cpp device/rocsparse/spgemm_4args_test.cpp) else() set(TEST_SOURCES device/spmv_test.cpp) endif() diff --git a/test/gtest/device/spgemm_reuse_test.cpp b/test/gtest/device/spgemm_reuse_test.cpp new file mode 100644 index 0000000..931c573 --- /dev/null +++ b/test/gtest/device/spgemm_reuse_test.cpp @@ -0,0 +1,324 @@ + +#include "../util.hpp" +#include + +#include +#include + +using value_t = float; +using index_t = spblas::index_t; +using offset_t = spblas::offset_t; + +TEST(CsrView, SpGEMMReuse) { + for (auto&& [m, k, nnz] : util::dims) { + for (auto&& n : {m, k}) { + auto [a_values, a_rowptr, a_colind, a_shape, a_nnz] = + spblas::generate_csr(m, k, nnz); + thrust::device_vector d_a_values(a_values); + thrust::device_vector d_a_rowptr(a_rowptr); + thrust::device_vector d_a_colind(a_colind); + spblas::csr_view d_a( + d_a_values.data().get(), d_a_rowptr.data().get(), + d_a_colind.data().get(), a_shape, a_nnz); + spblas::csr_view a(a_values, a_rowptr, + a_colind, a_shape, a_nnz); + + auto [b_values, b_rowptr, b_colind, b_shape, b_nnz] = + spblas::generate_csr(k, n, nnz); + thrust::device_vector d_b_values(b_values); + thrust::device_vector d_b_rowptr(b_rowptr); + thrust::device_vector d_b_colind(b_colind); + spblas::csr_view d_b( + d_b_values.data().get(), d_b_rowptr.data().get(), + d_b_colind.data().get(), b_shape, b_nnz); + spblas::csr_view b(b_values, b_rowptr, + b_colind, b_shape, b_nnz); + + thrust::device_vector d_c_rowptr(m + 1); + + spblas::csr_view d_c( + nullptr, d_c_rowptr.data().get(), nullptr, {m, n}, 0); + + spblas::spgemm_state_t state; + spblas::multiply_symbolic_compute(state, d_a, d_b, d_c); + auto nnz = state.result_nnz(); + thrust::device_vector d_c_values(nnz); + thrust::device_vector d_c_colind(nnz); + std::span d_c_values_span(d_c_values.data().get(), nnz); + std::span d_c_rowptr_span(d_c_rowptr.data().get(), m + 1); + std::span d_c_colind_span(d_c_colind.data().get(), nnz); + d_c.update(d_c_values_span, d_c_rowptr_span, d_c_colind_span, {m, n}, + nnz); + + spblas::multiply_symbolic_fill(state, d_a, d_b, d_c); + std::mt19937 g(0); + for (int i = 0; i < 3; i++) { + // we can change the value of a and b but only need to call + // multiply_numeric answer here. + if (i != 0) { + // regenerate value of a and b; + std::uniform_real_distribution val_dist(0.0, 100.0); + for (auto& v : a_values) { + v = val_dist(g); + } + for (auto& v : b_values) { + v = val_dist(g); + } + thrust::copy(a_values.begin(), a_values.end(), d_a_values.begin()); + thrust::copy(b_values.begin(), b_values.end(), d_b_values.begin()); + } + spblas::multiply_numeric(state, d_a, d_b, d_c); + std::vector c_values(nnz); + std::vector c_rowptr(m + 1); + std::vector c_colind(nnz); + thrust::copy(d_c_values.begin(), d_c_values.end(), c_values.begin()); + thrust::copy(d_c_rowptr.begin(), d_c_rowptr.end(), c_rowptr.begin()); + thrust::copy(d_c_colind.begin(), d_c_colind.end(), c_colind.begin()); + spblas::csr_view c(c_values, c_rowptr, + c_colind, {m, n}, nnz); + + spblas::__backend::spa_accumulator c_row_ref( + spblas::__backend::shape(c)[1]); + + spblas::__backend::spa_accumulator c_row_acc( + spblas::__backend::shape(c)[1]); + + for (auto&& [i, a_row] : spblas::__backend::rows(a)) { + c_row_ref.clear(); + for (auto&& [k, a_v] : a_row) { + auto&& b_row = spblas::__backend::lookup_row(b, k); + + for (auto&& [j, b_v] : b_row) { + c_row_ref[j] += a_v * b_v; + } + } + + auto&& c_row = spblas::__backend::lookup_row(c, i); + + // Accumulate output into `c_row_acc` so that we can allow + // duplicate column indices. + c_row_acc.clear(); + for (auto&& [j, c_v] : c_row) { + c_row_acc[j] += c_v; + } + + for (auto&& [j, c_v] : c_row) { + EXPECT_EQ_(c_row_ref[j], c_row_acc[j]); + } + + EXPECT_EQ(c_row_ref.size(), c_row_acc.size()); + } + } + } + } +} + +TEST(CsrView, SpGEMMReuse_AScaled) { + value_t alpha = 2.0f; + for (auto&& [m, k, nnz] : util::dims) { + for (auto&& n : {m, k}) { + auto [a_values, a_rowptr, a_colind, a_shape, a_nnz] = + spblas::generate_csr(m, k, nnz); + thrust::device_vector d_a_values(a_values); + thrust::device_vector d_a_rowptr(a_rowptr); + thrust::device_vector d_a_colind(a_colind); + spblas::csr_view d_a( + d_a_values.data().get(), d_a_rowptr.data().get(), + d_a_colind.data().get(), a_shape, a_nnz); + spblas::csr_view a(a_values, a_rowptr, + a_colind, a_shape, a_nnz); + + auto [b_values, b_rowptr, b_colind, b_shape, b_nnz] = + spblas::generate_csr(k, n, nnz); + thrust::device_vector d_b_values(b_values); + thrust::device_vector d_b_rowptr(b_rowptr); + thrust::device_vector d_b_colind(b_colind); + spblas::csr_view d_b( + d_b_values.data().get(), d_b_rowptr.data().get(), + d_b_colind.data().get(), b_shape, b_nnz); + spblas::csr_view b(b_values, b_rowptr, + b_colind, b_shape, b_nnz); + + thrust::device_vector d_c_rowptr(m + 1); + + spblas::csr_view d_c( + nullptr, d_c_rowptr.data().get(), nullptr, {m, n}, 0); + + spblas::spgemm_state_t state; + spblas::multiply_symbolic_compute(state, scaled(alpha, d_a), d_b, d_c); + auto nnz = state.result_nnz(); + thrust::device_vector d_c_values(nnz); + thrust::device_vector d_c_colind(nnz); + std::span d_c_values_span(d_c_values.data().get(), nnz); + std::span d_c_rowptr_span(d_c_rowptr.data().get(), m + 1); + std::span d_c_colind_span(d_c_colind.data().get(), nnz); + d_c.update(d_c_values_span, d_c_rowptr_span, d_c_colind_span, {m, n}, + nnz); + + spblas::multiply_symbolic_fill(state, scaled(alpha, d_a), d_b, d_c); + std::mt19937 g(0); + for (int i = 0; i < 3; i++) { + // we can change the value of a and b but only need to call + // multiply_numeric answer here. + if (i != 0) { + // regenerate value of a and b; + std::uniform_real_distribution val_dist(0.0, 100.0); + for (auto& v : a_values) { + v = val_dist(g); + } + for (auto& v : b_values) { + v = val_dist(g); + } + thrust::copy(a_values.begin(), a_values.end(), d_a_values.begin()); + thrust::copy(b_values.begin(), b_values.end(), d_b_values.begin()); + } + spblas::multiply_numeric(state, scaled(alpha, d_a), d_b, d_c); + std::vector c_values(nnz); + std::vector c_rowptr(m + 1); + std::vector c_colind(nnz); + thrust::copy(d_c_values.begin(), d_c_values.end(), c_values.begin()); + thrust::copy(d_c_rowptr.begin(), d_c_rowptr.end(), c_rowptr.begin()); + thrust::copy(d_c_colind.begin(), d_c_colind.end(), c_colind.begin()); + spblas::csr_view c(c_values, c_rowptr, + c_colind, {m, n}, nnz); + + spblas::__backend::spa_accumulator c_row_ref( + spblas::__backend::shape(c)[1]); + + spblas::__backend::spa_accumulator c_row_acc( + spblas::__backend::shape(c)[1]); + + for (auto&& [i, a_row] : spblas::__backend::rows(a)) { + c_row_ref.clear(); + for (auto&& [k, a_v] : a_row) { + auto&& b_row = spblas::__backend::lookup_row(b, k); + + for (auto&& [j, b_v] : b_row) { + c_row_ref[j] += alpha * a_v * b_v; + } + } + + auto&& c_row = spblas::__backend::lookup_row(c, i); + + // Accumulate output into `c_row_acc` so that we can allow + // duplicate column indices. + c_row_acc.clear(); + for (auto&& [j, c_v] : c_row) { + c_row_acc[j] += c_v; + } + + for (auto&& [j, c_v] : c_row) { + EXPECT_EQ_(c_row_ref[j], c_row_acc[j]); + } + + EXPECT_EQ(c_row_ref.size(), c_row_acc.size()); + } + } + } + } +} + +TEST(CsrView, SpGEMMReuse_BScaled) { + value_t alpha = 2.0f; + for (auto&& [m, k, nnz] : util::dims) { + for (auto&& n : {m, k}) { + auto [a_values, a_rowptr, a_colind, a_shape, a_nnz] = + spblas::generate_csr(m, k, nnz); + thrust::device_vector d_a_values(a_values); + thrust::device_vector d_a_rowptr(a_rowptr); + thrust::device_vector d_a_colind(a_colind); + spblas::csr_view d_a( + d_a_values.data().get(), d_a_rowptr.data().get(), + d_a_colind.data().get(), a_shape, a_nnz); + spblas::csr_view a(a_values, a_rowptr, + a_colind, a_shape, a_nnz); + + auto [b_values, b_rowptr, b_colind, b_shape, b_nnz] = + spblas::generate_csr(k, n, nnz); + thrust::device_vector d_b_values(b_values); + thrust::device_vector d_b_rowptr(b_rowptr); + thrust::device_vector d_b_colind(b_colind); + spblas::csr_view d_b( + d_b_values.data().get(), d_b_rowptr.data().get(), + d_b_colind.data().get(), b_shape, b_nnz); + spblas::csr_view b(b_values, b_rowptr, + b_colind, b_shape, b_nnz); + + thrust::device_vector d_c_rowptr(m + 1); + + spblas::csr_view d_c( + nullptr, d_c_rowptr.data().get(), nullptr, {m, n}, 0); + + spblas::spgemm_state_t state; + spblas::multiply_symbolic_compute(state, d_a, scaled(alpha, d_b), d_c); + auto nnz = state.result_nnz(); + thrust::device_vector d_c_values(nnz); + thrust::device_vector d_c_colind(nnz); + std::span d_c_values_span(d_c_values.data().get(), nnz); + std::span d_c_rowptr_span(d_c_rowptr.data().get(), m + 1); + std::span d_c_colind_span(d_c_colind.data().get(), nnz); + d_c.update(d_c_values_span, d_c_rowptr_span, d_c_colind_span, {m, n}, + nnz); + + spblas::multiply_symbolic_fill(state, d_a, scaled(alpha, d_b), d_c); + std::mt19937 g(0); + for (int i = 0; i < 3; i++) { + // we can change the value of a and b but only need to call + // multiply_numeric answer here. + if (i != 0) { + // regenerate value of a and b; + std::uniform_real_distribution val_dist(0.0, 100.0); + for (auto& v : a_values) { + v = val_dist(g); + } + for (auto& v : b_values) { + v = val_dist(g); + } + thrust::copy(a_values.begin(), a_values.end(), d_a_values.begin()); + thrust::copy(b_values.begin(), b_values.end(), d_b_values.begin()); + } + spblas::multiply_numeric(state, d_a, scaled(alpha, d_b), d_c); + std::vector c_values(nnz); + std::vector c_rowptr(m + 1); + std::vector c_colind(nnz); + thrust::copy(d_c_values.begin(), d_c_values.end(), c_values.begin()); + thrust::copy(d_c_rowptr.begin(), d_c_rowptr.end(), c_rowptr.begin()); + thrust::copy(d_c_colind.begin(), d_c_colind.end(), c_colind.begin()); + spblas::csr_view c(c_values, c_rowptr, + c_colind, {m, n}, nnz); + + spblas::__backend::spa_accumulator c_row_ref( + spblas::__backend::shape(c)[1]); + + spblas::__backend::spa_accumulator c_row_acc( + spblas::__backend::shape(c)[1]); + + for (auto&& [i, a_row] : spblas::__backend::rows(a)) { + c_row_ref.clear(); + for (auto&& [k, a_v] : a_row) { + auto&& b_row = spblas::__backend::lookup_row(b, k); + + for (auto&& [j, b_v] : b_row) { + c_row_ref[j] += alpha * a_v * b_v; + } + } + + auto&& c_row = spblas::__backend::lookup_row(c, i); + + // Accumulate output into `c_row_acc` so that we can allow + // duplicate column indices. + c_row_acc.clear(); + for (auto&& [j, c_v] : c_row) { + c_row_acc[j] += c_v; + } + + for (auto&& [j, c_v] : c_row) { + EXPECT_EQ_(c_row_ref[j], c_row_acc[j]); + } + + EXPECT_EQ(c_row_ref.size(), c_row_acc.size()); + } + } + } + } +} From e7ecada6d7f0a8b808077a231d80e079402560e9 Mon Sep 17 00:00:00 2001 From: "Yu-Hsiang M. Tsai" Date: Thu, 17 Apr 2025 16:43:16 +0200 Subject: [PATCH 05/14] add the test when changing the pointer --- .../vendor/rocsparse/multiply_spgemm.hpp | 17 +++ test/gtest/device/spgemm_reuse_test.cpp | 124 ++++++++++++++++++ 2 files changed, 141 insertions(+) diff --git a/include/spblas/vendor/rocsparse/multiply_spgemm.hpp b/include/spblas/vendor/rocsparse/multiply_spgemm.hpp index 7f215df..74e6f70 100644 --- a/include/spblas/vendor/rocsparse/multiply_spgemm.hpp +++ b/include/spblas/vendor/rocsparse/multiply_spgemm.hpp @@ -202,6 +202,7 @@ class spgemm_state_t { void multiply_numeric(A&& a, B&& b, C&& c, D&& d) { auto a_base = __detail::get_ultimate_base(a); auto b_base = __detail::get_ultimate_base(b); + auto d_base = __detail::get_ultimate_base(d); using matrix_type = decltype(a_base); using input_type = decltype(b_base); using output_type = std::remove_reference_t; @@ -213,6 +214,22 @@ class spgemm_state_t { auto beta_optional = __detail::get_scaling_factor(d); value_type beta = beta_optional.value_or(1); + // Update the pointer from the matrix but they must contains the same + // sparsity as the previous call. + __rocsparse::throw_if_error(rocsparse_csr_set_pointers( + this->mat_a_, a_base.rowptr().data(), a_base.colind().data(), + a_base.values().data())); + __rocsparse::throw_if_error(rocsparse_csr_set_pointers( + this->mat_b_, b_base.rowptr().data(), b_base.colind().data(), + b_base.values().data())); + __rocsparse::throw_if_error(rocsparse_csr_set_pointers( + this->mat_c_, c.rowptr().data(), c.colind().data(), c.values().data())); + if (d_base.values().data()) { + // when it is still a null matrix, we can not use set pointer function + __rocsparse::throw_if_error(rocsparse_csr_set_pointers( + this->mat_d_, d_base.rowptr().data(), d_base.colind().data(), + d_base.values().data())); + } __rocsparse::throw_if_error(rocsparse_spgemm( this->handle_.get(), rocsparse_operation_none, rocsparse_operation_none, &alpha, this->mat_a_, this->mat_b_, &beta, this->mat_d_, this->mat_c_, diff --git a/test/gtest/device/spgemm_reuse_test.cpp b/test/gtest/device/spgemm_reuse_test.cpp index 931c573..836190e 100644 --- a/test/gtest/device/spgemm_reuse_test.cpp +++ b/test/gtest/device/spgemm_reuse_test.cpp @@ -322,3 +322,127 @@ TEST(CsrView, SpGEMMReuse_BScaled) { } } } + +TEST(CsrView, SpGEMMReuseAndChangePointer) { + for (auto&& [m, k, nnz] : util::dims) { + for (auto&& n : {m, k}) { + auto [a_values, a_rowptr, a_colind, a_shape, a_nnz] = + spblas::generate_csr(m, k, nnz); + thrust::device_vector d_a_values(a_values); + thrust::device_vector d_a_rowptr(a_rowptr); + thrust::device_vector d_a_colind(a_colind); + spblas::csr_view d_a( + d_a_values.data().get(), d_a_rowptr.data().get(), + d_a_colind.data().get(), a_shape, a_nnz); + spblas::csr_view a(a_values, a_rowptr, + a_colind, a_shape, a_nnz); + + auto [b_values, b_rowptr, b_colind, b_shape, b_nnz] = + spblas::generate_csr(k, n, nnz); + thrust::device_vector d_b_values(b_values); + thrust::device_vector d_b_rowptr(b_rowptr); + thrust::device_vector d_b_colind(b_colind); + spblas::csr_view d_b( + d_b_values.data().get(), d_b_rowptr.data().get(), + d_b_colind.data().get(), b_shape, b_nnz); + spblas::csr_view b(b_values, b_rowptr, + b_colind, b_shape, b_nnz); + + thrust::device_vector d_c_rowptr(m + 1); + + spblas::csr_view d_c( + nullptr, d_c_rowptr.data().get(), nullptr, {m, n}, 0); + + spblas::spgemm_state_t state; + spblas::multiply_symbolic_compute(state, d_a, d_b, d_c); + auto nnz = state.result_nnz(); + thrust::device_vector d_c_values(nnz); + thrust::device_vector d_c_colind(nnz); + std::span d_c_values_span(d_c_values.data().get(), nnz); + std::span d_c_rowptr_span(d_c_rowptr.data().get(), m + 1); + std::span d_c_colind_span(d_c_colind.data().get(), nnz); + d_c.update(d_c_values_span, d_c_rowptr_span, d_c_colind_span, {m, n}, + nnz); + + spblas::multiply_symbolic_fill(state, d_a, d_b, d_c); + std::mt19937 g(0); + for (int i = 0; i < 3; i++) { + // regenerate value of a and b; + std::uniform_real_distribution val_dist(0.0, 100.0); + for (auto& v : a_values) { + v = val_dist(g); + } + for (auto& v : b_values) { + v = val_dist(g); + } + // create different pointers than the symbolic phase, but they still + // hold the same sparsity + thrust::device_vector d_a_values_new(a_values); + thrust::device_vector d_a_colind_new(d_a_colind); + thrust::device_vector d_a_rowptr_new(d_a_rowptr); + thrust::device_vector d_b_values_new(b_values); + thrust::device_vector d_b_colind_new(d_b_colind); + thrust::device_vector d_b_rowptr_new(d_b_rowptr); + thrust::device_vector d_c_values_new(d_c_values); + thrust::device_vector d_c_colind_new(d_c_colind); + thrust::device_vector d_c_rowptr_new(d_c_rowptr); + spblas::csr_view d_a( + d_a_values_new.data().get(), d_a_rowptr_new.data().get(), + d_a_colind_new.data().get(), a_shape, a_nnz); + spblas::csr_view d_b( + d_b_values_new.data().get(), d_b_rowptr_new.data().get(), + d_b_colind_new.data().get(), b_shape, b_nnz); + spblas::csr_view d_c( + d_c_values_new.data().get(), d_c_rowptr_new.data().get(), + d_c_colind_new.data().get(), {m, n}, nnz); + // call numeric on new data + spblas::multiply_numeric(state, d_a, d_b, d_c); + // move c back to host memory + std::vector c_values(nnz); + std::vector c_rowptr(m + 1); + std::vector c_colind(nnz); + thrust::copy(d_c_values_new.begin(), d_c_values_new.end(), + c_values.begin()); + thrust::copy(d_c_rowptr_new.begin(), d_c_rowptr_new.end(), + c_rowptr.begin()); + thrust::copy(d_c_colind_new.begin(), d_c_colind_new.end(), + c_colind.begin()); + + spblas::csr_view c(c_values, c_rowptr, + c_colind, {m, n}, nnz); + + spblas::__backend::spa_accumulator c_row_ref( + spblas::__backend::shape(c)[1]); + + spblas::__backend::spa_accumulator c_row_acc( + spblas::__backend::shape(c)[1]); + + for (auto&& [i, a_row] : spblas::__backend::rows(a)) { + c_row_ref.clear(); + for (auto&& [k, a_v] : a_row) { + auto&& b_row = spblas::__backend::lookup_row(b, k); + + for (auto&& [j, b_v] : b_row) { + c_row_ref[j] += a_v * b_v; + } + } + + auto&& c_row = spblas::__backend::lookup_row(c, i); + + // Accumulate output into `c_row_acc` so that we can allow + // duplicate column indices. + c_row_acc.clear(); + for (auto&& [j, c_v] : c_row) { + c_row_acc[j] += c_v; + } + + for (auto&& [j, c_v] : c_row) { + EXPECT_EQ_(c_row_ref[j], c_row_acc[j]); + } + + EXPECT_EQ(c_row_ref.size(), c_row_acc.size()); + } + } + } + } +} From 4595c8d43ca74f2101066ab54a47f2bc8e0a9f9e Mon Sep 17 00:00:00 2001 From: "Yu-Hsiang M. Tsai" Date: Thu, 8 May 2025 00:27:58 +0200 Subject: [PATCH 06/14] create a matrix descriptor helper --- .../spblas/vendor/rocsparse/descriptor.hpp | 33 +++++++++++++++++ include/spblas/vendor/rocsparse/multiply.hpp | 10 ++---- .../vendor/rocsparse/multiply_spgemm.hpp | 35 +++---------------- 3 files changed, 40 insertions(+), 38 deletions(-) create mode 100644 include/spblas/vendor/rocsparse/descriptor.hpp diff --git a/include/spblas/vendor/rocsparse/descriptor.hpp b/include/spblas/vendor/rocsparse/descriptor.hpp new file mode 100644 index 0000000..e47e073 --- /dev/null +++ b/include/spblas/vendor/rocsparse/descriptor.hpp @@ -0,0 +1,33 @@ +#pragma once + +#include + +#include + +#include +#include + +#include "exception.hpp" +#include "types.hpp" + +namespace spblas { +namespace __rocsparse { + +// create matrix descriptor from spblas csr view +template + requires __detail::is_csr_view_v +rocsparse_spmat_descr create_matrix_descr(mat&& a) { + using matrix_type = std::remove_cvref_t; + rocsparse_spmat_descr descr; + throw_if_error(rocsparse_create_csr_descr( + &descr, __backend::shape(a)[0], __backend::shape(a)[1], a.values().size(), + a.rowptr().data(), a.colind().data(), a.values().data(), + to_rocsparse_indextype(), + to_rocsparse_indextype(), + rocsparse_index_base_zero, + to_rocsparse_datatype())); + return descr; +} + +} // namespace __rocsparse +} // namespace spblas diff --git a/include/spblas/vendor/rocsparse/multiply.hpp b/include/spblas/vendor/rocsparse/multiply.hpp index 7d88c5c..212a5d1 100644 --- a/include/spblas/vendor/rocsparse/multiply.hpp +++ b/include/spblas/vendor/rocsparse/multiply.hpp @@ -10,6 +10,7 @@ #include #include +#include "descriptor.hpp" #include "exception.hpp" #include "hip_allocator.hpp" #include "types.hpp" @@ -59,14 +60,7 @@ class spmv_state_t { tensor_scalar_t alpha = alpha_optional.value_or(1); auto handle = this->handle_.get(); - rocsparse_spmat_descr mat; - __rocsparse::throw_if_error(rocsparse_create_csr_descr( - &mat, __backend::shape(a_base)[0], __backend::shape(a_base)[1], - a_base.values().size(), a_base.rowptr().data(), a_base.colind().data(), - a_base.values().data(), - to_rocsparse_indextype(), - to_rocsparse_indextype(), - rocsparse_index_base_zero, to_rocsparse_datatype())); + rocsparse_spmat_descr mat = __rocsparse::create_matrix_descr(a_base); rocsparse_dnvec_descr vecb; rocsparse_dnvec_descr vecc; __rocsparse::throw_if_error(rocsparse_create_dnvec_descr( diff --git a/include/spblas/vendor/rocsparse/multiply_spgemm.hpp b/include/spblas/vendor/rocsparse/multiply_spgemm.hpp index 74e6f70..5eae63e 100644 --- a/include/spblas/vendor/rocsparse/multiply_spgemm.hpp +++ b/include/spblas/vendor/rocsparse/multiply_spgemm.hpp @@ -10,6 +10,7 @@ #include #include +#include "descriptor.hpp" #include "exception.hpp" #include "hip_allocator.hpp" #include "types.hpp" @@ -85,36 +86,10 @@ class spgemm_state_t { value_type beta = beta_optional.value_or(1); auto handle = this->handle_.get(); // Create sparse matrix A in CSR format - __rocsparse::throw_if_error(rocsparse_create_csr_descr( - &this->mat_a_, __backend::shape(a_base)[0], __backend::shape(a_base)[1], - a_base.values().size(), a_base.rowptr().data(), a_base.colind().data(), - a_base.values().data(), - to_rocsparse_indextype(), - to_rocsparse_indextype(), - rocsparse_index_base_zero, to_rocsparse_datatype())); - __rocsparse::throw_if_error(rocsparse_create_csr_descr( - &this->mat_b_, __backend::shape(b_base)[0], __backend::shape(b_base)[1], - b_base.values().size(), b_base.rowptr().data(), b_base.colind().data(), - b_base.values().data(), - to_rocsparse_indextype(), - to_rocsparse_indextype(), - rocsparse_index_base_zero, - to_rocsparse_datatype())); - __rocsparse::throw_if_error(rocsparse_create_csr_descr( - &this->mat_c_, __backend::shape(a_base)[0], __backend::shape(b_base)[1], - 0, c.rowptr().data(), NULL, NULL, - to_rocsparse_indextype(), - to_rocsparse_indextype(), - rocsparse_index_base_zero, - to_rocsparse_datatype())); - __rocsparse::throw_if_error(rocsparse_create_csr_descr( - &this->mat_d_, __backend::shape(d_base)[0], __backend::shape(d_base)[1], - d_base.values().size(), d_base.rowptr().data(), d_base.colind().data(), - d_base.values().data(), - to_rocsparse_indextype(), - to_rocsparse_indextype(), - rocsparse_index_base_zero, - to_rocsparse_datatype())); + this->mat_a_ = __rocsparse::create_matrix_descr(a_base); + this->mat_b_ = __rocsparse::create_matrix_descr(b_base); + this->mat_c_ = __rocsparse::create_matrix_descr(c); + this->mat_d_ = __rocsparse::create_matrix_descr(d_base); //-------------------------------------------------------------------------- // SpGEMM Computation // ask buffer_size bytes for external memory From eeb3a674fb995d3e7a41b919e4c7197c78ddc703 Mon Sep 17 00:00:00 2001 From: "Yu-Hsiang M. Tsai" Date: Thu, 8 May 2025 00:36:35 +0200 Subject: [PATCH 07/14] add vector descriptor helper --- include/spblas/vendor/rocsparse/descriptor.hpp | 12 ++++++++++++ include/spblas/vendor/rocsparse/multiply.hpp | 10 ++-------- 2 files changed, 14 insertions(+), 8 deletions(-) diff --git a/include/spblas/vendor/rocsparse/descriptor.hpp b/include/spblas/vendor/rocsparse/descriptor.hpp index e47e073..cf45934 100644 --- a/include/spblas/vendor/rocsparse/descriptor.hpp +++ b/include/spblas/vendor/rocsparse/descriptor.hpp @@ -29,5 +29,17 @@ rocsparse_spmat_descr create_matrix_descr(mat&& a) { return descr; } +// create dense vector from mdspan +template + requires __ranges::contiguous_range +rocsparse_dnvec_descr create_vector_descr(vec&& v) { + using vector_type = std::remove_cvref_t; + rocsparse_dnvec_descr descr; + throw_if_error(rocsparse_create_dnvec_descr( + &descr, v.size(), v.data(), + to_rocsparse_datatype())); + return descr; +} + } // namespace __rocsparse } // namespace spblas diff --git a/include/spblas/vendor/rocsparse/multiply.hpp b/include/spblas/vendor/rocsparse/multiply.hpp index 212a5d1..d05ff98 100644 --- a/include/spblas/vendor/rocsparse/multiply.hpp +++ b/include/spblas/vendor/rocsparse/multiply.hpp @@ -61,14 +61,8 @@ class spmv_state_t { auto handle = this->handle_.get(); rocsparse_spmat_descr mat = __rocsparse::create_matrix_descr(a_base); - rocsparse_dnvec_descr vecb; - rocsparse_dnvec_descr vecc; - __rocsparse::throw_if_error(rocsparse_create_dnvec_descr( - &vecb, b_base.size(), b_base.data(), - to_rocsparse_datatype())); - __rocsparse::throw_if_error(rocsparse_create_dnvec_descr( - &vecc, c.size(), c.data(), - to_rocsparse_datatype())); + rocsparse_dnvec_descr vecb = __rocsparse::create_vector_descr(b_base); + rocsparse_dnvec_descr vecc = __rocsparse::create_vector_descr(c); value_type alpha_val = alpha; value_type beta = 0.0; long unsigned int buffer_size = 0; From 59c0b7d0aac5e44ece6edd299a22f36b0a138605 Mon Sep 17 00:00:00 2001 From: "Yu-Hsiang M. Tsai" Date: Thu, 22 May 2025 16:24:31 +0200 Subject: [PATCH 08/14] review update --- include/spblas/vendor/rocsparse/multiply_spgemm.hpp | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/include/spblas/vendor/rocsparse/multiply_spgemm.hpp b/include/spblas/vendor/rocsparse/multiply_spgemm.hpp index 5eae63e..c6ede83 100644 --- a/include/spblas/vendor/rocsparse/multiply_spgemm.hpp +++ b/include/spblas/vendor/rocsparse/multiply_spgemm.hpp @@ -1,5 +1,6 @@ #pragma once +#include #include #include #include @@ -85,19 +86,19 @@ class spgemm_state_t { auto beta_optional = __detail::get_scaling_factor(d); value_type beta = beta_optional.value_or(1); auto handle = this->handle_.get(); - // Create sparse matrix A in CSR format + // Create sparse matrix in CSR format this->mat_a_ = __rocsparse::create_matrix_descr(a_base); this->mat_b_ = __rocsparse::create_matrix_descr(b_base); this->mat_c_ = __rocsparse::create_matrix_descr(c); this->mat_d_ = __rocsparse::create_matrix_descr(d_base); - //-------------------------------------------------------------------------- - // SpGEMM Computation // ask buffer_size bytes for external memory __rocsparse::throw_if_error(rocsparse_spgemm( handle, rocsparse_operation_none, rocsparse_operation_none, &alpha, this->mat_a_, this->mat_b_, &beta, this->mat_d_, this->mat_c_, to_rocsparse_datatype(), rocsparse_spgemm_alg_default, rocsparse_spgemm_stage_buffer_size, &buffer_size, nullptr)); + // allocate the new buffer if it requires more than what the buffer + // currently has. if (buffer_size > this->buffer_size_) { this->alloc_.deallocate(workspace_, this->buffer_size_); this->buffer_size_ = buffer_size; @@ -113,6 +114,7 @@ class spgemm_state_t { int64_t c_num_cols; __rocsparse::throw_if_error(rocsparse_spmat_get_size( this->mat_c_, &c_num_rows, &c_num_cols, &this->result_nnz_)); + // form a shape this->result_shape_ = index(c_num_rows, c_num_cols); } @@ -218,10 +220,10 @@ class spgemm_state_t { std::function>; handle_manager handle_; rocsparse::hip_allocator alloc_; - long unsigned int buffer_size_; + std::uint64_t buffer_size_; char* workspace_; index result_shape_; - index_t result_nnz_; + std::int64_t result_nnz_; rocsparse_spmat_descr mat_a_; rocsparse_spmat_descr mat_b_; rocsparse_spmat_descr mat_c_; From 5f9c0905727f4efcad83e1153954e0ea0399ef3b Mon Sep 17 00:00:00 2001 From: "Yu-Hsiang M. Tsai" Date: Thu, 7 Aug 2025 16:13:58 +0200 Subject: [PATCH 09/14] fix and adapt the changes from main --- .../spblas/vendor/rocsparse/descriptor.hpp | 45 ------------------- .../vendor/rocsparse/multiply_spgemm.hpp | 19 ++++---- .../device/rocsparse/spgemm_4args_test.cpp | 8 ++-- test/gtest/device/spgemm_reuse_test.cpp | 8 ++-- test/gtest/device/spgemm_test.cpp | 6 +-- 5 files changed, 20 insertions(+), 66 deletions(-) delete mode 100644 include/spblas/vendor/rocsparse/descriptor.hpp diff --git a/include/spblas/vendor/rocsparse/descriptor.hpp b/include/spblas/vendor/rocsparse/descriptor.hpp deleted file mode 100644 index cf45934..0000000 --- a/include/spblas/vendor/rocsparse/descriptor.hpp +++ /dev/null @@ -1,45 +0,0 @@ -#pragma once - -#include - -#include - -#include -#include - -#include "exception.hpp" -#include "types.hpp" - -namespace spblas { -namespace __rocsparse { - -// create matrix descriptor from spblas csr view -template - requires __detail::is_csr_view_v -rocsparse_spmat_descr create_matrix_descr(mat&& a) { - using matrix_type = std::remove_cvref_t; - rocsparse_spmat_descr descr; - throw_if_error(rocsparse_create_csr_descr( - &descr, __backend::shape(a)[0], __backend::shape(a)[1], a.values().size(), - a.rowptr().data(), a.colind().data(), a.values().data(), - to_rocsparse_indextype(), - to_rocsparse_indextype(), - rocsparse_index_base_zero, - to_rocsparse_datatype())); - return descr; -} - -// create dense vector from mdspan -template - requires __ranges::contiguous_range -rocsparse_dnvec_descr create_vector_descr(vec&& v) { - using vector_type = std::remove_cvref_t; - rocsparse_dnvec_descr descr; - throw_if_error(rocsparse_create_dnvec_descr( - &descr, v.size(), v.data(), - to_rocsparse_datatype())); - return descr; -} - -} // namespace __rocsparse -} // namespace spblas diff --git a/include/spblas/vendor/rocsparse/multiply_spgemm.hpp b/include/spblas/vendor/rocsparse/multiply_spgemm.hpp index c6ede83..0b58f37 100644 --- a/include/spblas/vendor/rocsparse/multiply_spgemm.hpp +++ b/include/spblas/vendor/rocsparse/multiply_spgemm.hpp @@ -11,7 +11,6 @@ #include #include -#include "descriptor.hpp" #include "exception.hpp" #include "hip_allocator.hpp" #include "types.hpp" @@ -87,15 +86,15 @@ class spgemm_state_t { value_type beta = beta_optional.value_or(1); auto handle = this->handle_.get(); // Create sparse matrix in CSR format - this->mat_a_ = __rocsparse::create_matrix_descr(a_base); - this->mat_b_ = __rocsparse::create_matrix_descr(b_base); - this->mat_c_ = __rocsparse::create_matrix_descr(c); - this->mat_d_ = __rocsparse::create_matrix_descr(d_base); + this->mat_a_ = __rocsparse::create_rocsparse_handle(a_base); + this->mat_b_ = __rocsparse::create_rocsparse_handle(b_base); + this->mat_c_ = __rocsparse::create_rocsparse_handle(c); + this->mat_d_ = __rocsparse::create_rocsparse_handle(d_base); // ask buffer_size bytes for external memory __rocsparse::throw_if_error(rocsparse_spgemm( handle, rocsparse_operation_none, rocsparse_operation_none, &alpha, this->mat_a_, this->mat_b_, &beta, this->mat_d_, this->mat_c_, - to_rocsparse_datatype(), rocsparse_spgemm_alg_default, + detail::rocsparse_data_type_v, rocsparse_spgemm_alg_default, rocsparse_spgemm_stage_buffer_size, &buffer_size, nullptr)); // allocate the new buffer if it requires more than what the buffer // currently has. @@ -107,7 +106,7 @@ class spgemm_state_t { __rocsparse::throw_if_error(rocsparse_spgemm( handle, rocsparse_operation_none, rocsparse_operation_none, &alpha, this->mat_a_, this->mat_b_, &beta, this->mat_d_, this->mat_c_, - to_rocsparse_datatype(), rocsparse_spgemm_alg_default, + detail::rocsparse_data_type_v, rocsparse_spgemm_alg_default, rocsparse_spgemm_stage_nnz, &this->buffer_size_, this->workspace_)); // get matrix C non-zero entries and size int64_t c_num_rows; @@ -141,7 +140,7 @@ class spgemm_state_t { __rocsparse::throw_if_error(rocsparse_spgemm( handle_.get(), rocsparse_operation_none, rocsparse_operation_none, &alpha, this->mat_a_, this->mat_b_, &beta, this->mat_d_, this->mat_c_, - to_rocsparse_datatype(), rocsparse_spgemm_alg_default, + detail::rocsparse_data_type_v, rocsparse_spgemm_alg_default, rocsparse_spgemm_stage_compute, &this->buffer_size_, workspace_)); } @@ -168,7 +167,7 @@ class spgemm_state_t { __rocsparse::throw_if_error(rocsparse_spgemm( this->handle_.get(), rocsparse_operation_none, rocsparse_operation_none, &alpha, this->mat_a_, this->mat_b_, &beta, this->mat_d_, this->mat_c_, - to_rocsparse_datatype(), rocsparse_spgemm_alg_default, + detail::rocsparse_data_type_v, rocsparse_spgemm_alg_default, rocsparse_spgemm_stage_symbolic, &this->buffer_size_, this->workspace_)); } @@ -210,7 +209,7 @@ class spgemm_state_t { __rocsparse::throw_if_error(rocsparse_spgemm( this->handle_.get(), rocsparse_operation_none, rocsparse_operation_none, &alpha, this->mat_a_, this->mat_b_, &beta, this->mat_d_, this->mat_c_, - to_rocsparse_datatype(), rocsparse_spgemm_alg_default, + detail::rocsparse_data_type_v, rocsparse_spgemm_alg_default, rocsparse_spgemm_stage_numeric, &this->buffer_size_, this->workspace_)); } diff --git a/test/gtest/device/rocsparse/spgemm_4args_test.cpp b/test/gtest/device/rocsparse/spgemm_4args_test.cpp index f1b272b..1a8e75a 100644 --- a/test/gtest/device/rocsparse/spgemm_4args_test.cpp +++ b/test/gtest/device/rocsparse/spgemm_4args_test.cpp @@ -9,7 +9,7 @@ using value_t = float; using index_t = spblas::index_t; using offset_t = spblas::offset_t; -TEST(CsrView, SpGEMM_4Args) { +TEST(thrust_CsrView, SpGEMM_4Args) { for (auto&& [m, k, nnz] : util::dims) { for (auto&& n : {m, k}) { auto [a_values, a_rowptr, a_colind, a_shape, a_nnz] = @@ -111,7 +111,7 @@ TEST(CsrView, SpGEMM_4Args) { } } -TEST(CsrView, SpGEMM_4Args_AScaled) { +TEST(thrust_CsrView, SpGEMM_4Args_AScaled) { value_t alpha = 2.0f; for (auto&& [m, k, nnz] : util::dims) { for (auto&& n : {m, k}) { @@ -214,7 +214,7 @@ TEST(CsrView, SpGEMM_4Args_AScaled) { } } -TEST(CsrView, SpGEMM_4Args_BScaled) { +TEST(thrust_CsrView, SpGEMM_4Args_BScaled) { value_t alpha = 2.0f; for (auto&& [m, k, nnz] : util::dims) { for (auto&& n : {m, k}) { @@ -317,7 +317,7 @@ TEST(CsrView, SpGEMM_4Args_BScaled) { } } -TEST(CsrView, SpGEMM_4Args_DScaled) { +TEST(thrust_CsrView, SpGEMM_4Args_DScaled) { value_t alpha = 2.0f; for (auto&& [m, k, nnz] : util::dims) { for (auto&& n : {m, k}) { diff --git a/test/gtest/device/spgemm_reuse_test.cpp b/test/gtest/device/spgemm_reuse_test.cpp index 836190e..b798d6e 100644 --- a/test/gtest/device/spgemm_reuse_test.cpp +++ b/test/gtest/device/spgemm_reuse_test.cpp @@ -9,7 +9,7 @@ using value_t = float; using index_t = spblas::index_t; using offset_t = spblas::offset_t; -TEST(CsrView, SpGEMMReuse) { +TEST(thrust_CsrView, SpGEMMReuse) { for (auto&& [m, k, nnz] : util::dims) { for (auto&& n : {m, k}) { auto [a_values, a_rowptr, a_colind, a_shape, a_nnz] = @@ -113,7 +113,7 @@ TEST(CsrView, SpGEMMReuse) { } } -TEST(CsrView, SpGEMMReuse_AScaled) { +TEST(thrust_CsrView, SpGEMMReuse_AScaled) { value_t alpha = 2.0f; for (auto&& [m, k, nnz] : util::dims) { for (auto&& n : {m, k}) { @@ -218,7 +218,7 @@ TEST(CsrView, SpGEMMReuse_AScaled) { } } -TEST(CsrView, SpGEMMReuse_BScaled) { +TEST(thrust_CsrView, SpGEMMReuse_BScaled) { value_t alpha = 2.0f; for (auto&& [m, k, nnz] : util::dims) { for (auto&& n : {m, k}) { @@ -323,7 +323,7 @@ TEST(CsrView, SpGEMMReuse_BScaled) { } } -TEST(CsrView, SpGEMMReuseAndChangePointer) { +TEST(thrust_CsrView, SpGEMMReuseAndChangePointer) { for (auto&& [m, k, nnz] : util::dims) { for (auto&& n : {m, k}) { auto [a_values, a_rowptr, a_colind, a_shape, a_nnz] = diff --git a/test/gtest/device/spgemm_test.cpp b/test/gtest/device/spgemm_test.cpp index 4662ffd..98e1aed 100644 --- a/test/gtest/device/spgemm_test.cpp +++ b/test/gtest/device/spgemm_test.cpp @@ -9,7 +9,7 @@ using value_t = float; using index_t = spblas::index_t; using offset_t = spblas::offset_t; -TEST(CsrView, SpGEMM) { +TEST(thrust_CsrView, SpGEMM) { for (auto&& [m, k, nnz] : util::dims) { for (auto&& n : {m, k}) { auto [a_values, a_rowptr, a_colind, a_shape, a_nnz] = @@ -96,7 +96,7 @@ TEST(CsrView, SpGEMM) { } } -TEST(CsrView, SpGEMM_AScaled) { +TEST(thrust_CsrView, SpGEMM_AScaled) { value_t alpha = 2.0f; for (auto&& [m, k, nnz] : util::dims) { for (auto&& n : {m, k}) { @@ -184,7 +184,7 @@ TEST(CsrView, SpGEMM_AScaled) { } } -TEST(CsrView, SpGEMM_BScaled) { +TEST(thrust_CsrView, SpGEMM_BScaled) { value_t alpha = 2.0f; for (auto&& [m, k, nnz] : util::dims) { for (auto&& n : {m, k}) { From e36a51616e67c1fcb52f26d2987bc5ac50e80510 Mon Sep 17 00:00:00 2001 From: "Yu-Hsiang M. Tsai" Date: Wed, 20 Aug 2025 16:08:22 +0200 Subject: [PATCH 10/14] fix meaningless cmake function Co-authored-by: Benjamin Brock --- test/gtest/CMakeLists.txt | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/test/gtest/CMakeLists.txt b/test/gtest/CMakeLists.txt index 84248cf..cd96de1 100644 --- a/test/gtest/CMakeLists.txt +++ b/test/gtest/CMakeLists.txt @@ -13,16 +13,12 @@ if (SPBLAS_CPU_BACKEND) transpose_test.cpp triangular_solve_test.cpp) endif() -function(add_rocsparse_lang file_list) - if (ENABLE_ROCSPARSE) - set_source_files_properties(${${file_list}} PROPERTIES LANGUAGE HIP) - endif() -endfunction() + # GPU tests if (SPBLAS_GPU_BACKEND) if (ENABLE_ROCSPARSE) set(GPUTEST_SOURCES device/spmv_test.cpp device/spgemm_test.cpp device/spgemm_reuse_test.cpp device/rocsparse/spgemm_4args_test.cpp) - add_rocsparse_lang(GPUTEST_SOURCES) + set_source_files_properties(${GPUTEST_SOURCES} PROPERTIES LANGUAGE HIP) else () set(GPUTEST_SOURCES device/spmv_test.cpp) endif () From 6bfe91c1b7ac50b7ee31d8fc4e246ebc07f370e3 Mon Sep 17 00:00:00 2001 From: "Yu-Hsiang M. Tsai" Date: Thu, 24 Apr 2025 19:10:35 +0200 Subject: [PATCH 11/14] add cusparse spgemm --- CMakeLists.txt | 1 + include/spblas/vendor/cusparse/cusparse.hpp | 1 + include/spblas/vendor/cusparse/exception.hpp | 4 +- .../vendor/cusparse/multiply_spgemm.hpp | 211 ++++++++++++++++++ test/gtest/CMakeLists.txt | 2 +- test/gtest/device/spgemm_test.cpp | 40 ++-- 6 files changed, 236 insertions(+), 23 deletions(-) create mode 100644 include/spblas/vendor/cusparse/multiply_spgemm.hpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 7104450..a39415a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -90,6 +90,7 @@ endif() if (ENABLE_CUSPARSE) set(SPBLAS_GPU_BACKEND ON) find_package(CUDAToolkit REQUIRED) + enable_language(CUDA) target_link_libraries(spblas INTERFACE CUDA::cudart CUDA::cusparse CUDA::cublas) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DSPBLAS_ENABLE_CUSPARSE") endif() diff --git a/include/spblas/vendor/cusparse/cusparse.hpp b/include/spblas/vendor/cusparse/cusparse.hpp index 7caa698..014b2ba 100644 --- a/include/spblas/vendor/cusparse/cusparse.hpp +++ b/include/spblas/vendor/cusparse/cusparse.hpp @@ -1,3 +1,4 @@ #pragma once #include "multiply.hpp" +#include "multiply_spgemm.hpp" diff --git a/include/spblas/vendor/cusparse/exception.hpp b/include/spblas/vendor/cusparse/exception.hpp index d90ac30..4be80eb 100644 --- a/include/spblas/vendor/cusparse/exception.hpp +++ b/include/spblas/vendor/cusparse/exception.hpp @@ -10,7 +10,7 @@ namespace spblas { namespace __cusparse { // Throw an exception if the cudaError_t is not cudaSuccess. -void throw_if_error(cudaError_t error_code, std::string prefix = "") { +inline void throw_if_error(cudaError_t error_code, std::string prefix = "") { if (error_code == cudaSuccess) { return; } @@ -21,7 +21,7 @@ void throw_if_error(cudaError_t error_code, std::string prefix = "") { } // Throw an exception if the cusparseStatus_t is not CUSPARSE_STATUS_SUCCESS. -void throw_if_error(cusparseStatus_t error_code) { +inline void throw_if_error(cusparseStatus_t error_code) { if (error_code == CUSPARSE_STATUS_SUCCESS) { return; } else if (error_code == CUSPARSE_STATUS_NOT_INITIALIZED) { diff --git a/include/spblas/vendor/cusparse/multiply_spgemm.hpp b/include/spblas/vendor/cusparse/multiply_spgemm.hpp new file mode 100644 index 0000000..8b2e181 --- /dev/null +++ b/include/spblas/vendor/cusparse/multiply_spgemm.hpp @@ -0,0 +1,211 @@ +#pragma once + +#include +#include +#include + +#include +#include + +#include +#include + +#include "cuda_allocator.hpp" +#include "exception.hpp" +#include "types.hpp" + +namespace spblas { + +class spgemm_state_t { +public: + spgemm_state_t() : spgemm_state_t(cusparse::cuda_allocator{}) {} + + spgemm_state_t(cusparse::cuda_allocator alloc) + : alloc_(alloc), buffer_size_1_(0), buffer_size_2_(0), + workspace_1_(nullptr), workspace_2_(nullptr), result_nnz_(0), + result_shape_(0, 0) { + cusparseHandle_t handle; + __cusparse::throw_if_error(cusparseCreate(&handle)); + if (auto stream = alloc.stream()) { + cusparseSetStream(handle, stream); + } + handle_ = handle_manager(handle, [](cusparseHandle_t handle) { + __cusparse::throw_if_error(cusparseDestroy(handle)); + }); + __cusparse::throw_if_error(cusparseSpGEMM_createDescr(&descr_)); + } + + spgemm_state_t(cusparse::cuda_allocator alloc, cusparseHandle_t handle) + : alloc_(alloc), buffer_size_1_(0), buffer_size_2_(0), + workspace_1_(nullptr), workspace_2_(nullptr), result_nnz_(0), + result_shape_(0, 0) { + handle_ = handle_manager(handle, [](cusparseHandle_t handle) { + // it is provided by user, we do not delete it at all. + }); + __cusparse::throw_if_error(cusparseSpGEMM_createDescr(&descr_)); + } + + ~spgemm_state_t() { + alloc_.deallocate(workspace_1_, buffer_size_1_); + alloc_.deallocate(workspace_2_, buffer_size_2_); + __cusparse::throw_if_error(cusparseDestroySpMat(mat_a_)); + __cusparse::throw_if_error(cusparseDestroySpMat(mat_b_)); + __cusparse::throw_if_error(cusparseDestroySpMat(mat_c_)); + __cusparse::throw_if_error(cusparseSpGEMM_destroyDescr(descr_)); + } + + auto result_shape() { + return this->result_shape_; + } + + auto result_nnz() { + return this->result_nnz_; + } + + template + requires __detail::has_csr_base && __detail::has_csr_base && + __detail::is_csr_view_v + void multiply_compute(A&& a, B&& b, C&& c) { + auto a_base = __detail::get_ultimate_base(a); + auto b_base = __detail::get_ultimate_base(b); + using matrix_type = decltype(a_base); + using input_type = decltype(b_base); + using output_type = std::remove_reference_t; + using value_type = typename matrix_type::scalar_type; + size_t buffer_size = 0; + + auto alpha_optional = __detail::get_scaling_factor(a, b); + value_type alpha = alpha_optional.value_or(1); + value_type beta = 1; + auto handle = this->handle_.get(); + // Create sparse matrix A in CSR format + __cusparse::throw_if_error(cusparseCreateCsr( + &mat_a_, __backend::shape(a_base)[0], __backend::shape(a_base)[1], + a_base.values().size(), a_base.rowptr().data(), a_base.colind().data(), + a_base.values().data(), + to_cusparse_indextype(), + to_cusparse_indextype(), + CUSPARSE_INDEX_BASE_ZERO, to_cuda_datatype())); + __cusparse::throw_if_error(cusparseCreateCsr( + &mat_b_, __backend::shape(b_base)[0], __backend::shape(b_base)[1], + b_base.values().size(), b_base.rowptr().data(), b_base.colind().data(), + b_base.values().data(), + to_cusparse_indextype(), + to_cusparse_indextype(), + CUSPARSE_INDEX_BASE_ZERO, to_cuda_datatype())); + __cusparse::throw_if_error(cusparseCreateCsr( + &mat_c_, __backend::shape(a)[0], __backend::shape(b)[1], 0, + c.rowptr().data(), NULL, NULL, + to_cusparse_indextype(), + to_cusparse_indextype(), + CUSPARSE_INDEX_BASE_ZERO, to_cuda_datatype())); + + // ask bufferSize1 bytes for external memory + size_t buffer_size_1 = 0; + __cusparse::throw_if_error(cusparseSpGEMM_workEstimation( + handle, CUSPARSE_OPERATION_NON_TRANSPOSE, + CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, mat_a_, mat_b_, &beta, mat_c_, + to_cuda_datatype(), CUSPARSE_SPGEMM_DEFAULT, this->descr_, + &buffer_size_1, NULL)); + if (buffer_size_1 > this->buffer_size_1_) { + this->alloc_.deallocate(this->workspace_1_, buffer_size_1_); + this->buffer_size_1_ = buffer_size_1; + this->workspace_1_ = this->alloc_.allocate(buffer_size_1); + } + // inspect the matrices A and B to understand the memory requirement for + // the next step + __cusparse::throw_if_error(cusparseSpGEMM_workEstimation( + handle, CUSPARSE_OPERATION_NON_TRANSPOSE, + CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, mat_a_, mat_b_, &beta, mat_c_, + to_cuda_datatype(), CUSPARSE_SPGEMM_DEFAULT, this->descr_, + &buffer_size_1, this->workspace_1_)); + + // ask buffer_size_2 bytes for external memory + size_t buffer_size_2 = 0; + __cusparse::throw_if_error(cusparseSpGEMM_compute( + handle, CUSPARSE_OPERATION_NON_TRANSPOSE, + CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, mat_a_, mat_b_, &beta, mat_c_, + to_cuda_datatype(), CUSPARSE_SPGEMM_DEFAULT, this->descr_, + &buffer_size_2, NULL)); + if (buffer_size_2 > this->buffer_size_2_) { + this->alloc_.deallocate(this->workspace_2_, buffer_size_2_); + this->buffer_size_2_ = buffer_size_2; + this->workspace_2_ = this->alloc_.allocate(buffer_size_2); + } + + // compute the intermediate product of A * B + cusparseSpGEMM_compute( + handle, CUSPARSE_OPERATION_NON_TRANSPOSE, + CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, mat_a_, mat_b_, &beta, mat_c_, + to_cuda_datatype(), CUSPARSE_SPGEMM_DEFAULT, this->descr_, + &buffer_size_2, this->workspace_2_); + // get matrix C non-zero entries c_nnz + int64_t c_num_rows, c_num_cols, c_nnz; + cusparseSpMatGetSize(mat_c_, &c_num_rows, &c_num_cols, &c_nnz); + this->result_nnz_ = c_nnz; + this->result_shape_ = index(c_num_rows, c_num_cols); + } + + // C = AB + template + requires __detail::has_csr_base && __detail::has_csr_base && + __detail::is_csr_view_v + void multiply_fill(A&& a, B&& b, C&& c) { + auto a_base = __detail::get_ultimate_base(a); + auto b_base = __detail::get_ultimate_base(b); + using matrix_type = decltype(a_base); + using input_type = decltype(b_base); + using output_type = std::remove_reference_t; + using value_type = typename matrix_type::scalar_type; + + auto alpha_optional = __detail::get_scaling_factor(a, b); + value_type alpha = alpha_optional.value_or(1); + value_type beta = 1; + auto handle = this->handle_.get(); + __cusparse::throw_if_error(cusparseCsrSetPointers( + mat_c_, c.rowptr().data(), c.colind().data(), c.values().data())); + + __cusparse::throw_if_error(cusparseSpGEMM_copy( + handle, CUSPARSE_OPERATION_NON_TRANSPOSE, + CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, mat_a_, mat_b_, &beta, mat_c_, + to_cuda_datatype(), CUSPARSE_SPGEMM_DEFAULT, this->descr_)); + } + +private: + using handle_manager = + std::unique_ptr::element_type, + std::function>; + handle_manager handle_; + cusparse::cuda_allocator alloc_; + size_t buffer_size_1_; + size_t buffer_size_2_; + char* workspace_1_; + char* workspace_2_; + index result_shape_; + index_t result_nnz_; + cusparseSpMatDescr_t mat_a_; + cusparseSpMatDescr_t mat_b_; + cusparseSpMatDescr_t mat_c_; + cusparseSpGEMMDescr_t descr_; +}; + +template + requires __detail::has_csr_base && __detail::has_csr_base && + __detail::is_csr_view_v +void multiply_inspect(spgemm_state_t& spgemm_handle, A&& a, B&& b, C&& c) {} + +template + requires __detail::has_csr_base && __detail::has_csr_base && + __detail::is_csr_view_v +void multiply_compute(spgemm_state_t& spgemm_handle, A&& a, B&& b, C&& c) { + spgemm_handle.multiply_compute(a, b, c); +} + +template + requires __detail::has_csr_base && __detail::has_csr_base && + __detail::is_csr_view_v +void multiply_fill(spgemm_state_t& spgemm_handle, A&& a, B&& b, C&& c) { + spgemm_handle.multiply_fill(a, b, c); +} + +} // namespace spblas diff --git a/test/gtest/CMakeLists.txt b/test/gtest/CMakeLists.txt index cd96de1..4a468da 100644 --- a/test/gtest/CMakeLists.txt +++ b/test/gtest/CMakeLists.txt @@ -20,7 +20,7 @@ if (SPBLAS_GPU_BACKEND) set(GPUTEST_SOURCES device/spmv_test.cpp device/spgemm_test.cpp device/spgemm_reuse_test.cpp device/rocsparse/spgemm_4args_test.cpp) set_source_files_properties(${GPUTEST_SOURCES} PROPERTIES LANGUAGE HIP) else () - set(GPUTEST_SOURCES device/spmv_test.cpp) + set(GPUTEST_SOURCES device/spmv_test.cpp device/spgemm_test.cpp) endif () list(APPEND TEST_SOURCES ${GPUTEST_SOURCES}) endif() diff --git a/test/gtest/device/spgemm_test.cpp b/test/gtest/device/spgemm_test.cpp index 98e1aed..03d5d1f 100644 --- a/test/gtest/device/spgemm_test.cpp +++ b/test/gtest/device/spgemm_test.cpp @@ -33,8 +33,10 @@ TEST(thrust_CsrView, SpGEMM) { d_b_colind.data().get(), b_shape, b_nnz); spblas::csr_view b(b_values, b_rowptr, b_colind, b_shape, b_nnz); - - thrust::device_vector d_c_rowptr(m + 1); + std::vector c_rowptr(m + 1); + // Can not create the vector with the size because it will use for_each to + // initilize the value in cuda. + thrust::device_vector d_c_rowptr(c_rowptr); spblas::csr_view d_c( nullptr, d_c_rowptr.data().get(), nullptr, {m, n}, 0); @@ -42,8 +44,10 @@ TEST(thrust_CsrView, SpGEMM) { spblas::spgemm_state_t state; spblas::multiply_compute(state, d_a, d_b, d_c); auto nnz = state.result_nnz(); - thrust::device_vector d_c_values(nnz); - thrust::device_vector d_c_colind(nnz); + std::vector c_values(nnz); + std::vector c_colind(nnz); + thrust::device_vector d_c_values(c_values); + thrust::device_vector d_c_colind(c_colind); std::span d_c_values_span(d_c_values.data().get(), nnz); std::span d_c_rowptr_span(d_c_rowptr.data().get(), m + 1); std::span d_c_colind_span(d_c_colind.data().get(), nnz); @@ -52,9 +56,6 @@ TEST(thrust_CsrView, SpGEMM) { spblas::multiply_fill(state, d_a, d_b, d_c); - std::vector c_values(nnz); - std::vector c_rowptr(m + 1); - std::vector c_colind(nnz); thrust::copy(d_c_values.begin(), d_c_values.end(), c_values.begin()); thrust::copy(d_c_rowptr.begin(), d_c_rowptr.end(), c_rowptr.begin()); thrust::copy(d_c_colind.begin(), d_c_colind.end(), c_colind.begin()); @@ -121,8 +122,8 @@ TEST(thrust_CsrView, SpGEMM_AScaled) { d_b_colind.data().get(), b_shape, b_nnz); spblas::csr_view b(b_values, b_rowptr, b_colind, b_shape, b_nnz); - - thrust::device_vector d_c_rowptr(m + 1); + std::vector c_rowptr(m + 1); + thrust::device_vector d_c_rowptr(c_rowptr); spblas::csr_view d_c( nullptr, d_c_rowptr.data().get(), nullptr, {m, n}, 0); @@ -130,8 +131,10 @@ TEST(thrust_CsrView, SpGEMM_AScaled) { spblas::spgemm_state_t state; spblas::multiply_compute(state, spblas::scaled(alpha, d_a), d_b, d_c); auto nnz = state.result_nnz(); - thrust::device_vector d_c_values(nnz); - thrust::device_vector d_c_colind(nnz); + std::vector c_values(nnz); + std::vector c_colind(nnz); + thrust::device_vector d_c_values(c_values); + thrust::device_vector d_c_colind(c_colind); std::span d_c_values_span(d_c_values.data().get(), nnz); std::span d_c_rowptr_span(d_c_rowptr.data().get(), m + 1); std::span d_c_colind_span(d_c_colind.data().get(), nnz); @@ -140,9 +143,6 @@ TEST(thrust_CsrView, SpGEMM_AScaled) { spblas::multiply_fill(state, spblas::scaled(alpha, d_a), d_b, d_c); - std::vector c_values(nnz); - std::vector c_rowptr(m + 1); - std::vector c_colind(nnz); thrust::copy(d_c_values.begin(), d_c_values.end(), c_values.begin()); thrust::copy(d_c_rowptr.begin(), d_c_rowptr.end(), c_rowptr.begin()); thrust::copy(d_c_colind.begin(), d_c_colind.end(), c_colind.begin()); @@ -210,7 +210,8 @@ TEST(thrust_CsrView, SpGEMM_BScaled) { spblas::csr_view b(b_values, b_rowptr, b_colind, b_shape, b_nnz); - thrust::device_vector d_c_rowptr(m + 1); + std::vector c_rowptr(m + 1); + thrust::device_vector d_c_rowptr(c_rowptr); spblas::csr_view d_c( nullptr, d_c_rowptr.data().get(), nullptr, {m, n}, 0); @@ -218,8 +219,10 @@ TEST(thrust_CsrView, SpGEMM_BScaled) { spblas::spgemm_state_t state; spblas::multiply_compute(state, d_a, spblas::scaled(alpha, d_b), d_c); auto nnz = state.result_nnz(); - thrust::device_vector d_c_values(nnz); - thrust::device_vector d_c_colind(nnz); + std::vector c_values(nnz); + std::vector c_colind(nnz); + thrust::device_vector d_c_values(c_values); + thrust::device_vector d_c_colind(c_colind); std::span d_c_values_span(d_c_values.data().get(), nnz); std::span d_c_rowptr_span(d_c_rowptr.data().get(), m + 1); std::span d_c_colind_span(d_c_colind.data().get(), nnz); @@ -228,9 +231,6 @@ TEST(thrust_CsrView, SpGEMM_BScaled) { spblas::multiply_fill(state, d_a, spblas::scaled(alpha, d_b), d_c); - std::vector c_values(nnz); - std::vector c_rowptr(m + 1); - std::vector c_colind(nnz); thrust::copy(d_c_values.begin(), d_c_values.end(), c_values.begin()); thrust::copy(d_c_rowptr.begin(), d_c_rowptr.end(), c_rowptr.begin()); thrust::copy(d_c_colind.begin(), d_c_colind.end(), c_colind.begin()); From b731ce7e42f47481083f86680093ea7c32a6851d Mon Sep 17 00:00:00 2001 From: "Yu-Hsiang M. Tsai" Date: Thu, 5 Jun 2025 17:25:54 +0200 Subject: [PATCH 12/14] use the helper function to create descriptor --- include/spblas/vendor/cusparse/descriptor.hpp | 45 +++++++++++++++++++ .../vendor/cusparse/multiply_spgemm.hpp | 34 +++++--------- 2 files changed, 55 insertions(+), 24 deletions(-) create mode 100644 include/spblas/vendor/cusparse/descriptor.hpp diff --git a/include/spblas/vendor/cusparse/descriptor.hpp b/include/spblas/vendor/cusparse/descriptor.hpp new file mode 100644 index 0000000..e3ed2a1 --- /dev/null +++ b/include/spblas/vendor/cusparse/descriptor.hpp @@ -0,0 +1,45 @@ +#pragma once + +#include + +#include + +#include +#include + +#include "exception.hpp" +#include "types.hpp" + +namespace spblas { +namespace __cusparse { + +// create matrix descriptor from spblas csr view +template + requires __detail::is_csr_view_v +cusparseSpMatDescr_t create_matrix_descr(mat&& a) { + using matrix_type = std::remove_cvref_t; + cusparseSpMatDescr_t descr; + throw_if_error(cusparseCreateCsr( + &descr, __backend::shape(a)[0], __backend::shape(a)[1], a.values().size(), + a.rowptr().data(), a.colind().data(), a.values().data(), + to_cusparse_indextype(), + to_cusparse_indextype(), + CUSPARSE_INDEX_BASE_ZERO, + to_cuda_datatype())); + return descr; +} + +// create dense vector from mdspan +template + requires __ranges::contiguous_range +cusparseDnVecDescr_t create_vector_descr(vec&& v) { + using vector_type = std::remove_cvref_t; + cusparseDnVecDescr_t descr; + throw_if_error(cusparseCreateDnVec( + &descr, v.size(), v.data(), + to_cuda_datatype())); + return descr; +} + +} // namespace __cusparse +} // namespace spblas diff --git a/include/spblas/vendor/cusparse/multiply_spgemm.hpp b/include/spblas/vendor/cusparse/multiply_spgemm.hpp index 8b2e181..f49b1d5 100644 --- a/include/spblas/vendor/cusparse/multiply_spgemm.hpp +++ b/include/spblas/vendor/cusparse/multiply_spgemm.hpp @@ -11,6 +11,7 @@ #include #include "cuda_allocator.hpp" +#include "descriptor.hpp" #include "exception.hpp" #include "types.hpp" @@ -78,27 +79,12 @@ class spgemm_state_t { value_type alpha = alpha_optional.value_or(1); value_type beta = 1; auto handle = this->handle_.get(); - // Create sparse matrix A in CSR format - __cusparse::throw_if_error(cusparseCreateCsr( - &mat_a_, __backend::shape(a_base)[0], __backend::shape(a_base)[1], - a_base.values().size(), a_base.rowptr().data(), a_base.colind().data(), - a_base.values().data(), - to_cusparse_indextype(), - to_cusparse_indextype(), - CUSPARSE_INDEX_BASE_ZERO, to_cuda_datatype())); - __cusparse::throw_if_error(cusparseCreateCsr( - &mat_b_, __backend::shape(b_base)[0], __backend::shape(b_base)[1], - b_base.values().size(), b_base.rowptr().data(), b_base.colind().data(), - b_base.values().data(), - to_cusparse_indextype(), - to_cusparse_indextype(), - CUSPARSE_INDEX_BASE_ZERO, to_cuda_datatype())); - __cusparse::throw_if_error(cusparseCreateCsr( - &mat_c_, __backend::shape(a)[0], __backend::shape(b)[1], 0, - c.rowptr().data(), NULL, NULL, - to_cusparse_indextype(), - to_cusparse_indextype(), - CUSPARSE_INDEX_BASE_ZERO, to_cuda_datatype())); + __cusparse::throw_if_error(cusparseDestroySpMat(mat_a_)); + __cusparse::throw_if_error(cusparseDestroySpMat(mat_b_)); + __cusparse::throw_if_error(cusparseDestroySpMat(mat_c_)); + mat_a_ = __cusparse::create_matrix_descr(a_base); + mat_b_ = __cusparse::create_matrix_descr(b_base); + mat_c_ = __cusparse::create_matrix_descr(c); // ask bufferSize1 bytes for external memory size_t buffer_size_1 = 0; @@ -183,9 +169,9 @@ class spgemm_state_t { char* workspace_2_; index result_shape_; index_t result_nnz_; - cusparseSpMatDescr_t mat_a_; - cusparseSpMatDescr_t mat_b_; - cusparseSpMatDescr_t mat_c_; + cusparseSpMatDescr_t mat_a_ = nullptr; + cusparseSpMatDescr_t mat_b_ = nullptr; + cusparseSpMatDescr_t mat_c_ = nullptr; cusparseSpGEMMDescr_t descr_; }; From 144aa14db682a63255f70ee8b91d57a4b6fcf0eb Mon Sep 17 00:00:00 2001 From: "Yu-Hsiang M. Tsai" Date: Thu, 5 Jun 2025 18:23:33 +0200 Subject: [PATCH 13/14] create cusparse spgemm reuse --- .../vendor/cusparse/multiply_spgemm.hpp | 186 +++++++++++++++++- test/gtest/CMakeLists.txt | 2 +- test/gtest/device/spgemm_reuse_test.cpp | 70 +++---- 3 files changed, 220 insertions(+), 38 deletions(-) diff --git a/include/spblas/vendor/cusparse/multiply_spgemm.hpp b/include/spblas/vendor/cusparse/multiply_spgemm.hpp index f49b1d5..e5fa1ac 100644 --- a/include/spblas/vendor/cusparse/multiply_spgemm.hpp +++ b/include/spblas/vendor/cusparse/multiply_spgemm.hpp @@ -22,9 +22,10 @@ class spgemm_state_t { spgemm_state_t() : spgemm_state_t(cusparse::cuda_allocator{}) {} spgemm_state_t(cusparse::cuda_allocator alloc) - : alloc_(alloc), buffer_size_1_(0), buffer_size_2_(0), - workspace_1_(nullptr), workspace_2_(nullptr), result_nnz_(0), - result_shape_(0, 0) { + : alloc_(alloc), buffer_size_1_(0), buffer_size_2_(0), buffer_size_3_(0), + buffer_size_4_(0), buffer_size_5_(0), workspace_1_(nullptr), + workspace_2_(nullptr), workspace_3_(nullptr), workspace_4_(nullptr), + workspace_5_(nullptr), result_nnz_(0), result_shape_(0, 0) { cusparseHandle_t handle; __cusparse::throw_if_error(cusparseCreate(&handle)); if (auto stream = alloc.stream()) { @@ -157,6 +158,156 @@ class spgemm_state_t { to_cuda_datatype(), CUSPARSE_SPGEMM_DEFAULT, this->descr_)); } + template + requires __detail::has_csr_base && __detail::has_csr_base && + __detail::is_csr_view_v + void multiply_symbolic_compute(A&& a, B&& b, C&& c) { + auto a_base = __detail::get_ultimate_base(a); + auto b_base = __detail::get_ultimate_base(b); + using matrix_type = decltype(a_base); + using input_type = decltype(b_base); + using output_type = std::remove_reference_t; + using value_type = typename matrix_type::scalar_type; + size_t buffer_size = 0; + + auto alpha_optional = __detail::get_scaling_factor(a, b); + value_type alpha = alpha_optional.value_or(1); + value_type beta = 1; + auto handle = this->handle_.get(); + __cusparse::throw_if_error(cusparseDestroySpMat(mat_a_)); + __cusparse::throw_if_error(cusparseDestroySpMat(mat_b_)); + __cusparse::throw_if_error(cusparseDestroySpMat(mat_c_)); + mat_a_ = __cusparse::create_matrix_descr(a_base); + mat_b_ = __cusparse::create_matrix_descr(b_base); + mat_c_ = __cusparse::create_matrix_descr(c); + + // ask bufferSize1 bytes for external memory + size_t buffer_size_1 = 0; + __cusparse::throw_if_error(cusparseSpGEMMreuse_workEstimation( + handle, CUSPARSE_OPERATION_NON_TRANSPOSE, + CUSPARSE_OPERATION_NON_TRANSPOSE, mat_a_, mat_b_, mat_c_, + CUSPARSE_SPGEMM_DEFAULT, this->descr_, &buffer_size_1, NULL)); + if (buffer_size_1 > this->buffer_size_1_) { + this->alloc_.deallocate(this->workspace_1_, buffer_size_1_); + this->buffer_size_1_ = buffer_size_1; + this->workspace_1_ = this->alloc_.allocate(buffer_size_1); + } + // inspect the matrices A and B to understand the memory requirement for + // the next step + __cusparse::throw_if_error(cusparseSpGEMMreuse_workEstimation( + handle, CUSPARSE_OPERATION_NON_TRANSPOSE, + CUSPARSE_OPERATION_NON_TRANSPOSE, mat_a_, mat_b_, mat_c_, + CUSPARSE_SPGEMM_DEFAULT, this->descr_, &buffer_size_1, + this->workspace_1_)); + + // ask buffer_size_2/3/4 bytes for external memory + size_t buffer_size_2 = 0; + size_t buffer_size_3 = 0; + size_t buffer_size_4 = 0; + cusparseSpGEMMreuse_nnz(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, + CUSPARSE_OPERATION_NON_TRANSPOSE, mat_a_, mat_b_, + mat_c_, CUSPARSE_SPGEMM_DEFAULT, this->descr_, + &buffer_size_2, NULL, &buffer_size_3, NULL, + &buffer_size_4, NULL); + if (buffer_size_2 > this->buffer_size_2_) { + this->alloc_.deallocate(this->workspace_2_, buffer_size_2_); + this->buffer_size_2_ = buffer_size_2; + this->workspace_2_ = this->alloc_.allocate(buffer_size_2); + } + if (buffer_size_3 > this->buffer_size_3_) { + this->alloc_.deallocate(this->workspace_3_, buffer_size_3_); + this->buffer_size_3_ = buffer_size_3; + this->workspace_3_ = this->alloc_.allocate(buffer_size_3); + } + if (buffer_size_4 > this->buffer_size_4_) { + this->alloc_.deallocate(this->workspace_4_, buffer_size_4_); + this->buffer_size_4_ = buffer_size_4; + this->workspace_4_ = this->alloc_.allocate(buffer_size_4); + } + + // compute nnz + cusparseSpGEMMreuse_nnz(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, + CUSPARSE_OPERATION_NON_TRANSPOSE, mat_a_, mat_b_, + mat_c_, CUSPARSE_SPGEMM_DEFAULT, this->descr_, + &buffer_size_2, this->workspace_2_, &buffer_size_3, + this->workspace_3_, &buffer_size_4, + this->workspace_4_); + // get matrix C non-zero entries c_nnz + int64_t c_num_rows, c_num_cols, c_nnz; + cusparseSpMatGetSize(mat_c_, &c_num_rows, &c_num_cols, &c_nnz); + this->result_nnz_ = c_nnz; + this->result_shape_ = index(c_num_rows, c_num_cols); + } + + template + requires __detail::has_csr_base && __detail::has_csr_base && + __detail::is_csr_view_v + void multiply_symbolic_fill(A&& a, B&& b, C&& c) { + auto a_base = __detail::get_ultimate_base(a); + auto b_base = __detail::get_ultimate_base(b); + using matrix_type = decltype(a_base); + using input_type = decltype(b_base); + using output_type = std::remove_reference_t; + using value_type = typename matrix_type::scalar_type; + + auto alpha_optional = __detail::get_scaling_factor(a, b); + value_type alpha = alpha_optional.value_or(1); + value_type beta = 0; + + __cusparse::throw_if_error(cusparseCsrSetPointers( + this->mat_c_, c.rowptr().data(), c.colind().data(), c.values().data())); + + auto handle = this->handle_.get(); + size_t buffer_size_5 = 0; + cusparseSpGEMMreuse_copy(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, + CUSPARSE_OPERATION_NON_TRANSPOSE, mat_a_, mat_b_, + mat_c_, CUSPARSE_SPGEMM_DEFAULT, this->descr_, + &buffer_size_5, NULL); + if (buffer_size_5 > this->buffer_size_5_) { + this->alloc_.deallocate(this->workspace_5_, buffer_size_5_); + this->buffer_size_5_ = buffer_size_5; + this->workspace_5_ = this->alloc_.allocate(buffer_size_5); + } + cusparseSpGEMMreuse_copy(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, + CUSPARSE_OPERATION_NON_TRANSPOSE, mat_a_, mat_b_, + mat_c_, CUSPARSE_SPGEMM_DEFAULT, this->descr_, + &buffer_size_5, this->workspace_5_); + } + + template + requires __detail::has_csr_base && __detail::has_csr_base && + __detail::is_csr_view_v + void multiply_numeric(A&& a, B&& b, C&& c) { + auto a_base = __detail::get_ultimate_base(a); + auto b_base = __detail::get_ultimate_base(b); + using matrix_type = decltype(a_base); + using input_type = decltype(b_base); + using output_type = std::remove_reference_t; + using value_type = typename matrix_type::scalar_type; + + auto alpha_optional = __detail::get_scaling_factor(a, b); + tensor_scalar_t alpha = alpha_optional.value_or(1); + value_type alpha_val = alpha; + value_type beta = 0; + + auto handle = this->handle_.get(); + + // Update the pointer from the matrix but they must contains the same + // sparsity as the previous call. + __cusparse::throw_if_error( + cusparseCsrSetPointers(this->mat_a_, a_base.rowptr().data(), + a_base.colind().data(), a_base.values().data())); + __cusparse::throw_if_error( + cusparseCsrSetPointers(this->mat_b_, b_base.rowptr().data(), + b_base.colind().data(), b_base.values().data())); + __cusparse::throw_if_error(cusparseCsrSetPointers( + this->mat_c_, c.rowptr().data(), c.colind().data(), c.values().data())); + cusparseSpGEMMreuse_compute( + handle, CUSPARSE_OPERATION_NON_TRANSPOSE, + CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, mat_a_, mat_b_, &beta, mat_c_, + to_cuda_datatype(), CUSPARSE_SPGEMM_DEFAULT, this->descr_); + } + private: using handle_manager = std::unique_ptr::element_type, @@ -165,8 +316,14 @@ class spgemm_state_t { cusparse::cuda_allocator alloc_; size_t buffer_size_1_; size_t buffer_size_2_; + size_t buffer_size_3_; + size_t buffer_size_4_; + size_t buffer_size_5_; char* workspace_1_; char* workspace_2_; + char* workspace_3_; + char* workspace_4_; + char* workspace_5_; index result_shape_; index_t result_nnz_; cusparseSpMatDescr_t mat_a_ = nullptr; @@ -194,4 +351,27 @@ void multiply_fill(spgemm_state_t& spgemm_handle, A&& a, B&& b, C&& c) { spgemm_handle.multiply_fill(a, b, c); } +template + requires __detail::has_csr_base && __detail::has_csr_base && + __detail::is_csr_view_v +void multiply_symbolic_compute(spgemm_state_t& spgemm_handle, A&& a, B&& b, + C&& c) { + spgemm_handle.multiply_symbolic_compute(a, b, c); +} + +template + requires __detail::has_csr_base && __detail::has_csr_base && + __detail::is_csr_view_v +void multiply_symbolic_fill(spgemm_state_t& spgemm_handle, A&& a, B&& b, + C&& c) { + spgemm_handle.multiply_symbolic_fill(a, b, c); +} + +template + requires __detail::has_csr_base && __detail::has_csr_base && + __detail::is_csr_view_v +void multiply_numeric(spgemm_state_t& spgemm_handle, A&& a, B&& b, C&& c) { + spgemm_handle.multiply_numeric(a, b, c); +} + } // namespace spblas diff --git a/test/gtest/CMakeLists.txt b/test/gtest/CMakeLists.txt index 4a468da..884f549 100644 --- a/test/gtest/CMakeLists.txt +++ b/test/gtest/CMakeLists.txt @@ -20,7 +20,7 @@ if (SPBLAS_GPU_BACKEND) set(GPUTEST_SOURCES device/spmv_test.cpp device/spgemm_test.cpp device/spgemm_reuse_test.cpp device/rocsparse/spgemm_4args_test.cpp) set_source_files_properties(${GPUTEST_SOURCES} PROPERTIES LANGUAGE HIP) else () - set(GPUTEST_SOURCES device/spmv_test.cpp device/spgemm_test.cpp) + set(GPUTEST_SOURCES device/spmv_test.cpp device/spgemm_test.cpp device/spgemm_reuse_test.cpp) endif () list(APPEND TEST_SOURCES ${GPUTEST_SOURCES}) endif() diff --git a/test/gtest/device/spgemm_reuse_test.cpp b/test/gtest/device/spgemm_reuse_test.cpp index b798d6e..3a6e339 100644 --- a/test/gtest/device/spgemm_reuse_test.cpp +++ b/test/gtest/device/spgemm_reuse_test.cpp @@ -34,7 +34,8 @@ TEST(thrust_CsrView, SpGEMMReuse) { spblas::csr_view b(b_values, b_rowptr, b_colind, b_shape, b_nnz); - thrust::device_vector d_c_rowptr(m + 1); + std::vector c_rowptr(m + 1); + thrust::device_vector d_c_rowptr(c_rowptr); spblas::csr_view d_c( nullptr, d_c_rowptr.data().get(), nullptr, {m, n}, 0); @@ -42,8 +43,10 @@ TEST(thrust_CsrView, SpGEMMReuse) { spblas::spgemm_state_t state; spblas::multiply_symbolic_compute(state, d_a, d_b, d_c); auto nnz = state.result_nnz(); - thrust::device_vector d_c_values(nnz); - thrust::device_vector d_c_colind(nnz); + std::vector c_values(nnz); + std::vector c_colind(nnz); + thrust::device_vector d_c_values(c_values); + thrust::device_vector d_c_colind(c_colind); std::span d_c_values_span(d_c_values.data().get(), nnz); std::span d_c_rowptr_span(d_c_rowptr.data().get(), m + 1); std::span d_c_colind_span(d_c_colind.data().get(), nnz); @@ -68,9 +71,6 @@ TEST(thrust_CsrView, SpGEMMReuse) { thrust::copy(b_values.begin(), b_values.end(), d_b_values.begin()); } spblas::multiply_numeric(state, d_a, d_b, d_c); - std::vector c_values(nnz); - std::vector c_rowptr(m + 1); - std::vector c_colind(nnz); thrust::copy(d_c_values.begin(), d_c_values.end(), c_values.begin()); thrust::copy(d_c_rowptr.begin(), d_c_rowptr.end(), c_rowptr.begin()); thrust::copy(d_c_colind.begin(), d_c_colind.end(), c_colind.begin()); @@ -138,8 +138,8 @@ TEST(thrust_CsrView, SpGEMMReuse_AScaled) { d_b_colind.data().get(), b_shape, b_nnz); spblas::csr_view b(b_values, b_rowptr, b_colind, b_shape, b_nnz); - - thrust::device_vector d_c_rowptr(m + 1); + std::vector c_rowptr(m + 1); + thrust::device_vector d_c_rowptr(c_rowptr); spblas::csr_view d_c( nullptr, d_c_rowptr.data().get(), nullptr, {m, n}, 0); @@ -147,8 +147,10 @@ TEST(thrust_CsrView, SpGEMMReuse_AScaled) { spblas::spgemm_state_t state; spblas::multiply_symbolic_compute(state, scaled(alpha, d_a), d_b, d_c); auto nnz = state.result_nnz(); - thrust::device_vector d_c_values(nnz); - thrust::device_vector d_c_colind(nnz); + std::vector c_values(nnz); + std::vector c_colind(nnz); + thrust::device_vector d_c_values(c_values); + thrust::device_vector d_c_colind(c_colind); std::span d_c_values_span(d_c_values.data().get(), nnz); std::span d_c_rowptr_span(d_c_rowptr.data().get(), m + 1); std::span d_c_colind_span(d_c_colind.data().get(), nnz); @@ -173,9 +175,6 @@ TEST(thrust_CsrView, SpGEMMReuse_AScaled) { thrust::copy(b_values.begin(), b_values.end(), d_b_values.begin()); } spblas::multiply_numeric(state, scaled(alpha, d_a), d_b, d_c); - std::vector c_values(nnz); - std::vector c_rowptr(m + 1); - std::vector c_colind(nnz); thrust::copy(d_c_values.begin(), d_c_values.end(), c_values.begin()); thrust::copy(d_c_rowptr.begin(), d_c_rowptr.end(), c_rowptr.begin()); thrust::copy(d_c_colind.begin(), d_c_colind.end(), c_colind.begin()); @@ -243,8 +242,8 @@ TEST(thrust_CsrView, SpGEMMReuse_BScaled) { d_b_colind.data().get(), b_shape, b_nnz); spblas::csr_view b(b_values, b_rowptr, b_colind, b_shape, b_nnz); - - thrust::device_vector d_c_rowptr(m + 1); + std::vector c_rowptr(m + 1); + thrust::device_vector d_c_rowptr(c_rowptr); spblas::csr_view d_c( nullptr, d_c_rowptr.data().get(), nullptr, {m, n}, 0); @@ -252,8 +251,10 @@ TEST(thrust_CsrView, SpGEMMReuse_BScaled) { spblas::spgemm_state_t state; spblas::multiply_symbolic_compute(state, d_a, scaled(alpha, d_b), d_c); auto nnz = state.result_nnz(); - thrust::device_vector d_c_values(nnz); - thrust::device_vector d_c_colind(nnz); + std::vector c_values(nnz); + std::vector c_colind(nnz); + thrust::device_vector d_c_values(c_values); + thrust::device_vector d_c_colind(c_colind); std::span d_c_values_span(d_c_values.data().get(), nnz); std::span d_c_rowptr_span(d_c_rowptr.data().get(), m + 1); std::span d_c_colind_span(d_c_colind.data().get(), nnz); @@ -278,9 +279,6 @@ TEST(thrust_CsrView, SpGEMMReuse_BScaled) { thrust::copy(b_values.begin(), b_values.end(), d_b_values.begin()); } spblas::multiply_numeric(state, d_a, scaled(alpha, d_b), d_c); - std::vector c_values(nnz); - std::vector c_rowptr(m + 1); - std::vector c_colind(nnz); thrust::copy(d_c_values.begin(), d_c_values.end(), c_values.begin()); thrust::copy(d_c_rowptr.begin(), d_c_rowptr.end(), c_rowptr.begin()); thrust::copy(d_c_colind.begin(), d_c_colind.end(), c_colind.begin()); @@ -348,7 +346,8 @@ TEST(thrust_CsrView, SpGEMMReuseAndChangePointer) { spblas::csr_view b(b_values, b_rowptr, b_colind, b_shape, b_nnz); - thrust::device_vector d_c_rowptr(m + 1); + std::vector c_rowptr(m + 1); + thrust::device_vector d_c_rowptr(c_rowptr); spblas::csr_view d_c( nullptr, d_c_rowptr.data().get(), nullptr, {m, n}, 0); @@ -356,8 +355,10 @@ TEST(thrust_CsrView, SpGEMMReuseAndChangePointer) { spblas::spgemm_state_t state; spblas::multiply_symbolic_compute(state, d_a, d_b, d_c); auto nnz = state.result_nnz(); - thrust::device_vector d_c_values(nnz); - thrust::device_vector d_c_colind(nnz); + std::vector c_values(nnz); + std::vector c_colind(nnz); + thrust::device_vector d_c_values(c_values); + thrust::device_vector d_c_colind(c_colind); std::span d_c_values_span(d_c_values.data().get(), nnz); std::span d_c_rowptr_span(d_c_rowptr.data().get(), m + 1); std::span d_c_colind_span(d_c_colind.data().get(), nnz); @@ -365,6 +366,9 @@ TEST(thrust_CsrView, SpGEMMReuseAndChangePointer) { nnz); spblas::multiply_symbolic_fill(state, d_a, d_b, d_c); + // move the sparsity back to host for later copy + thrust::copy(d_c_rowptr.begin(), d_c_rowptr.end(), c_rowptr.begin()); + thrust::copy(d_c_colind.begin(), d_c_colind.end(), c_colind.begin()); std::mt19937 g(0); for (int i = 0; i < 3; i++) { // regenerate value of a and b; @@ -376,16 +380,17 @@ TEST(thrust_CsrView, SpGEMMReuseAndChangePointer) { v = val_dist(g); } // create different pointers than the symbolic phase, but they still - // hold the same sparsity + // hold the same sparsity. + // note. cuda without nvcc can only copy from host to device thrust::device_vector d_a_values_new(a_values); - thrust::device_vector d_a_colind_new(d_a_colind); - thrust::device_vector d_a_rowptr_new(d_a_rowptr); + thrust::device_vector d_a_colind_new(a_colind); + thrust::device_vector d_a_rowptr_new(a_rowptr); thrust::device_vector d_b_values_new(b_values); - thrust::device_vector d_b_colind_new(d_b_colind); - thrust::device_vector d_b_rowptr_new(d_b_rowptr); - thrust::device_vector d_c_values_new(d_c_values); - thrust::device_vector d_c_colind_new(d_c_colind); - thrust::device_vector d_c_rowptr_new(d_c_rowptr); + thrust::device_vector d_b_colind_new(b_colind); + thrust::device_vector d_b_rowptr_new(b_rowptr); + thrust::device_vector d_c_values_new(c_values); + thrust::device_vector d_c_colind_new(c_colind); + thrust::device_vector d_c_rowptr_new(c_rowptr); spblas::csr_view d_a( d_a_values_new.data().get(), d_a_rowptr_new.data().get(), d_a_colind_new.data().get(), a_shape, a_nnz); @@ -398,9 +403,6 @@ TEST(thrust_CsrView, SpGEMMReuseAndChangePointer) { // call numeric on new data spblas::multiply_numeric(state, d_a, d_b, d_c); // move c back to host memory - std::vector c_values(nnz); - std::vector c_rowptr(m + 1); - std::vector c_colind(nnz); thrust::copy(d_c_values_new.begin(), d_c_values_new.end(), c_values.begin()); thrust::copy(d_c_rowptr_new.begin(), d_c_rowptr_new.end(), From 7845296f6a9752e6f532956c868484209f8f23aa Mon Sep 17 00:00:00 2001 From: "Yu-Hsiang M. Tsai" Date: Thu, 21 Aug 2025 16:35:23 +0200 Subject: [PATCH 14/14] reuse the current changes on main --- include/spblas/vendor/cusparse/descriptor.hpp | 45 ------------------- .../vendor/cusparse/multiply_spgemm.hpp | 42 ++++++++--------- 2 files changed, 22 insertions(+), 65 deletions(-) delete mode 100644 include/spblas/vendor/cusparse/descriptor.hpp diff --git a/include/spblas/vendor/cusparse/descriptor.hpp b/include/spblas/vendor/cusparse/descriptor.hpp deleted file mode 100644 index e3ed2a1..0000000 --- a/include/spblas/vendor/cusparse/descriptor.hpp +++ /dev/null @@ -1,45 +0,0 @@ -#pragma once - -#include - -#include - -#include -#include - -#include "exception.hpp" -#include "types.hpp" - -namespace spblas { -namespace __cusparse { - -// create matrix descriptor from spblas csr view -template - requires __detail::is_csr_view_v -cusparseSpMatDescr_t create_matrix_descr(mat&& a) { - using matrix_type = std::remove_cvref_t; - cusparseSpMatDescr_t descr; - throw_if_error(cusparseCreateCsr( - &descr, __backend::shape(a)[0], __backend::shape(a)[1], a.values().size(), - a.rowptr().data(), a.colind().data(), a.values().data(), - to_cusparse_indextype(), - to_cusparse_indextype(), - CUSPARSE_INDEX_BASE_ZERO, - to_cuda_datatype())); - return descr; -} - -// create dense vector from mdspan -template - requires __ranges::contiguous_range -cusparseDnVecDescr_t create_vector_descr(vec&& v) { - using vector_type = std::remove_cvref_t; - cusparseDnVecDescr_t descr; - throw_if_error(cusparseCreateDnVec( - &descr, v.size(), v.data(), - to_cuda_datatype())); - return descr; -} - -} // namespace __cusparse -} // namespace spblas diff --git a/include/spblas/vendor/cusparse/multiply_spgemm.hpp b/include/spblas/vendor/cusparse/multiply_spgemm.hpp index e5fa1ac..d8b9f71 100644 --- a/include/spblas/vendor/cusparse/multiply_spgemm.hpp +++ b/include/spblas/vendor/cusparse/multiply_spgemm.hpp @@ -11,7 +11,7 @@ #include #include "cuda_allocator.hpp" -#include "descriptor.hpp" +#include "detail/cusparse_tensors.hpp" #include "exception.hpp" #include "types.hpp" @@ -83,17 +83,17 @@ class spgemm_state_t { __cusparse::throw_if_error(cusparseDestroySpMat(mat_a_)); __cusparse::throw_if_error(cusparseDestroySpMat(mat_b_)); __cusparse::throw_if_error(cusparseDestroySpMat(mat_c_)); - mat_a_ = __cusparse::create_matrix_descr(a_base); - mat_b_ = __cusparse::create_matrix_descr(b_base); - mat_c_ = __cusparse::create_matrix_descr(c); + mat_a_ = __cusparse::create_cusparse_handle(a_base); + mat_b_ = __cusparse::create_cusparse_handle(b_base); + mat_c_ = __cusparse::create_cusparse_handle(c); // ask bufferSize1 bytes for external memory size_t buffer_size_1 = 0; __cusparse::throw_if_error(cusparseSpGEMM_workEstimation( handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, mat_a_, mat_b_, &beta, mat_c_, - to_cuda_datatype(), CUSPARSE_SPGEMM_DEFAULT, this->descr_, - &buffer_size_1, NULL)); + detail::cuda_data_type_v, CUSPARSE_SPGEMM_DEFAULT, + this->descr_, &buffer_size_1, NULL)); if (buffer_size_1 > this->buffer_size_1_) { this->alloc_.deallocate(this->workspace_1_, buffer_size_1_); this->buffer_size_1_ = buffer_size_1; @@ -104,16 +104,16 @@ class spgemm_state_t { __cusparse::throw_if_error(cusparseSpGEMM_workEstimation( handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, mat_a_, mat_b_, &beta, mat_c_, - to_cuda_datatype(), CUSPARSE_SPGEMM_DEFAULT, this->descr_, - &buffer_size_1, this->workspace_1_)); + detail::cuda_data_type_v, CUSPARSE_SPGEMM_DEFAULT, + this->descr_, &buffer_size_1, this->workspace_1_)); // ask buffer_size_2 bytes for external memory size_t buffer_size_2 = 0; __cusparse::throw_if_error(cusparseSpGEMM_compute( handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, mat_a_, mat_b_, &beta, mat_c_, - to_cuda_datatype(), CUSPARSE_SPGEMM_DEFAULT, this->descr_, - &buffer_size_2, NULL)); + detail::cuda_data_type_v, CUSPARSE_SPGEMM_DEFAULT, + this->descr_, &buffer_size_2, NULL)); if (buffer_size_2 > this->buffer_size_2_) { this->alloc_.deallocate(this->workspace_2_, buffer_size_2_); this->buffer_size_2_ = buffer_size_2; @@ -124,8 +124,8 @@ class spgemm_state_t { cusparseSpGEMM_compute( handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, mat_a_, mat_b_, &beta, mat_c_, - to_cuda_datatype(), CUSPARSE_SPGEMM_DEFAULT, this->descr_, - &buffer_size_2, this->workspace_2_); + detail::cuda_data_type_v, CUSPARSE_SPGEMM_DEFAULT, + this->descr_, &buffer_size_2, this->workspace_2_); // get matrix C non-zero entries c_nnz int64_t c_num_rows, c_num_cols, c_nnz; cusparseSpMatGetSize(mat_c_, &c_num_rows, &c_num_cols, &c_nnz); @@ -155,7 +155,8 @@ class spgemm_state_t { __cusparse::throw_if_error(cusparseSpGEMM_copy( handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, mat_a_, mat_b_, &beta, mat_c_, - to_cuda_datatype(), CUSPARSE_SPGEMM_DEFAULT, this->descr_)); + detail::cuda_data_type_v, CUSPARSE_SPGEMM_DEFAULT, + this->descr_)); } template @@ -177,9 +178,9 @@ class spgemm_state_t { __cusparse::throw_if_error(cusparseDestroySpMat(mat_a_)); __cusparse::throw_if_error(cusparseDestroySpMat(mat_b_)); __cusparse::throw_if_error(cusparseDestroySpMat(mat_c_)); - mat_a_ = __cusparse::create_matrix_descr(a_base); - mat_b_ = __cusparse::create_matrix_descr(b_base); - mat_c_ = __cusparse::create_matrix_descr(c); + mat_a_ = __cusparse::create_cusparse_handle(a_base); + mat_b_ = __cusparse::create_cusparse_handle(b_base); + mat_c_ = __cusparse::create_cusparse_handle(c); // ask bufferSize1 bytes for external memory size_t buffer_size_1 = 0; @@ -302,10 +303,11 @@ class spgemm_state_t { b_base.colind().data(), b_base.values().data())); __cusparse::throw_if_error(cusparseCsrSetPointers( this->mat_c_, c.rowptr().data(), c.colind().data(), c.values().data())); - cusparseSpGEMMreuse_compute( - handle, CUSPARSE_OPERATION_NON_TRANSPOSE, - CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, mat_a_, mat_b_, &beta, mat_c_, - to_cuda_datatype(), CUSPARSE_SPGEMM_DEFAULT, this->descr_); + cusparseSpGEMMreuse_compute(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, + CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, + mat_a_, mat_b_, &beta, mat_c_, + detail::cuda_data_type_v, + CUSPARSE_SPGEMM_DEFAULT, this->descr_); } private: